From 064b2eeb52ef5abb1b1134784069d681af71383c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Mon, 2 Feb 2026 14:52:40 -0800 Subject: [PATCH 01/37] initial commit as from JathavanSriramNVIDIA --- applications/osmo/.gitignore | 50 + applications/osmo/LICENSE | 176 +++ applications/osmo/README.md | 489 ++++++ .../osmo/deploy/000-prerequisites/README.md | 291 ++++ .../deploy/000-prerequisites/install-tools.sh | 311 ++++ .../000-prerequisites/nebius-env-init.sh | 418 ++++++ .../deploy/000-prerequisites/secrets-init.sh | 380 +++++ .../wireguard-client-setup.sh | 216 +++ applications/osmo/deploy/001-iac/README.md | 245 +++ applications/osmo/deploy/001-iac/locals.tf | 77 + applications/osmo/deploy/001-iac/main.tf | 134 ++ .../osmo/deploy/001-iac/modules/k8s/main.tf | 176 +++ .../deploy/001-iac/modules/k8s/outputs.tf | 38 + .../modules/k8s/templates/cloud-init.yaml | 26 + .../deploy/001-iac/modules/k8s/variables.tf | 192 +++ .../deploy/001-iac/modules/k8s/versions.tf | 7 + .../deploy/001-iac/modules/platform/main.tf | 215 +++ .../001-iac/modules/platform/outputs.tf | 140 ++ .../001-iac/modules/platform/variables.tf | 208 +++ .../001-iac/modules/platform/versions.tf | 11 + .../deploy/001-iac/modules/wireguard/main.tf | 70 + .../001-iac/modules/wireguard/outputs.tf | 28 + .../wireguard/templates/cloud-init.yaml | 122 ++ .../001-iac/modules/wireguard/variables.tf | 91 ++ .../001-iac/modules/wireguard/versions.tf | 7 + applications/osmo/deploy/001-iac/outputs.tf | 168 +++ ...aform.tfvars.cost-optimized-secure.example | 107 ++ .../terraform.tfvars.cost-optimized.example | 86 ++ .../deploy/001-iac/terraform.tfvars.example | 72 + .../terraform.tfvars.production.example | 84 ++ .../001-iac/terraform.tfvars.secure.example | 87 ++ applications/osmo/deploy/001-iac/variables.tf | 437 ++++++ applications/osmo/deploy/001-iac/versions.tf | 25 + .../002-setup/01-deploy-gpu-infrastructure.sh | 111 ++ .../002-setup/02-deploy-observability.sh | 103 ++ 
.../002-setup/03-deploy-osmo-control-plane.sh | 1319 +++++++++++++++++ .../002-setup/04-deploy-osmo-backend.sh | 289 ++++ .../deploy/002-setup/05-configure-storage.sh | 251 ++++ .../002-setup/06-configure-service-url.sh | 129 ++ .../002-setup/07-configure-gpu-platform.sh | 129 ++ applications/osmo/deploy/002-setup/README.md | 358 +++++ .../cleanup/uninstall-gpu-infrastructure.sh | 43 + .../cleanup/uninstall-observability.sh | 48 + .../cleanup/uninstall-osmo-backend.sh | 30 + .../cleanup/uninstall-osmo-control-plane.sh | 34 + .../osmo/deploy/002-setup/defaults.sh | 37 + .../deploy/002-setup/gpu_platform_update.json | 14 + .../deploy/002-setup/gpu_pod_template.json | 16 + .../osmo/deploy/002-setup/lib/common.sh | 199 +++ .../osmo/deploy/002-setup/nginx-proxy.yaml | 120 ++ .../deploy/002-setup/osmo-values-noauth.yaml | 170 +++ .../deploy/002-setup/values/gpu-operator.yaml | 57 + .../osmo/deploy/002-setup/values/grafana.yaml | 70 + .../002-setup/values/kai-scheduler.yaml | 13 + .../osmo/deploy/002-setup/values/loki.yaml | 68 + .../002-setup/values/network-operator.yaml | 62 + .../values/osmo-backend-operator.yaml | 37 + .../deploy/002-setup/values/prometheus.yaml | 107 ++ .../deploy/002-setup/values/promtail.yaml | 46 + applications/osmo/deploy/README.md | 168 +++ applications/osmo/workflows/README.md | 156 ++ .../osmo/workflows/osmo/gpu_test.yaml | 56 + .../osmo/workflows/osmo/hello_nebius.yaml | 30 + 63 files changed, 9454 insertions(+) create mode 100755 applications/osmo/.gitignore create mode 100755 applications/osmo/LICENSE create mode 100755 applications/osmo/README.md create mode 100755 applications/osmo/deploy/000-prerequisites/README.md create mode 100755 applications/osmo/deploy/000-prerequisites/install-tools.sh create mode 100755 applications/osmo/deploy/000-prerequisites/nebius-env-init.sh create mode 100755 applications/osmo/deploy/000-prerequisites/secrets-init.sh create mode 100755 applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh 
create mode 100755 applications/osmo/deploy/001-iac/README.md create mode 100755 applications/osmo/deploy/001-iac/locals.tf create mode 100755 applications/osmo/deploy/001-iac/main.tf create mode 100755 applications/osmo/deploy/001-iac/modules/k8s/main.tf create mode 100755 applications/osmo/deploy/001-iac/modules/k8s/outputs.tf create mode 100755 applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml create mode 100755 applications/osmo/deploy/001-iac/modules/k8s/variables.tf create mode 100755 applications/osmo/deploy/001-iac/modules/k8s/versions.tf create mode 100755 applications/osmo/deploy/001-iac/modules/platform/main.tf create mode 100755 applications/osmo/deploy/001-iac/modules/platform/outputs.tf create mode 100755 applications/osmo/deploy/001-iac/modules/platform/variables.tf create mode 100755 applications/osmo/deploy/001-iac/modules/platform/versions.tf create mode 100755 applications/osmo/deploy/001-iac/modules/wireguard/main.tf create mode 100755 applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf create mode 100755 applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml create mode 100755 applications/osmo/deploy/001-iac/modules/wireguard/variables.tf create mode 100755 applications/osmo/deploy/001-iac/modules/wireguard/versions.tf create mode 100755 applications/osmo/deploy/001-iac/outputs.tf create mode 100755 applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example create mode 100755 applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example create mode 100755 applications/osmo/deploy/001-iac/terraform.tfvars.example create mode 100755 applications/osmo/deploy/001-iac/terraform.tfvars.production.example create mode 100755 applications/osmo/deploy/001-iac/terraform.tfvars.secure.example create mode 100755 applications/osmo/deploy/001-iac/variables.tf create mode 100755 applications/osmo/deploy/001-iac/versions.tf create mode 100755 
applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh create mode 100755 applications/osmo/deploy/002-setup/02-deploy-observability.sh create mode 100755 applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh create mode 100755 applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh create mode 100755 applications/osmo/deploy/002-setup/05-configure-storage.sh create mode 100755 applications/osmo/deploy/002-setup/06-configure-service-url.sh create mode 100755 applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh create mode 100755 applications/osmo/deploy/002-setup/README.md create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh create mode 100755 applications/osmo/deploy/002-setup/defaults.sh create mode 100755 applications/osmo/deploy/002-setup/gpu_platform_update.json create mode 100755 applications/osmo/deploy/002-setup/gpu_pod_template.json create mode 100755 applications/osmo/deploy/002-setup/lib/common.sh create mode 100755 applications/osmo/deploy/002-setup/nginx-proxy.yaml create mode 100755 applications/osmo/deploy/002-setup/osmo-values-noauth.yaml create mode 100755 applications/osmo/deploy/002-setup/values/gpu-operator.yaml create mode 100755 applications/osmo/deploy/002-setup/values/grafana.yaml create mode 100755 applications/osmo/deploy/002-setup/values/kai-scheduler.yaml create mode 100755 applications/osmo/deploy/002-setup/values/loki.yaml create mode 100755 applications/osmo/deploy/002-setup/values/network-operator.yaml create mode 100755 applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml create mode 100755 applications/osmo/deploy/002-setup/values/prometheus.yaml create mode 100755 
applications/osmo/deploy/002-setup/values/promtail.yaml create mode 100755 applications/osmo/deploy/README.md create mode 100755 applications/osmo/workflows/README.md create mode 100755 applications/osmo/workflows/osmo/gpu_test.yaml create mode 100755 applications/osmo/workflows/osmo/hello_nebius.yaml diff --git a/applications/osmo/.gitignore b/applications/osmo/.gitignore new file mode 100755 index 000000000..47334b4a7 --- /dev/null +++ b/applications/osmo/.gitignore @@ -0,0 +1,50 @@ +# Terraform +*.tfstate +*.tfstate.* +*.tfvars +!*.tfvars.example +!*.tfvars.*.example +.terraform/ +.terraform.lock.hcl +*.out +crash.log +crash.*.log +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# SSH keys +*.pem +id_rsa* +*.key + +# Secrets +*.secret +.env +.env.* +!.env.example + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Kubernetes +kubeconfig +kubeconfig.* +!kubeconfig.example + +# WireGuard +*.conf +!*.conf.example +wg-client-*.conf diff --git a/applications/osmo/LICENSE b/applications/osmo/LICENSE new file mode 100755 index 000000000..2b8f06340 --- /dev/null +++ b/applications/osmo/LICENSE @@ -0,0 +1,176 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. 
For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to the Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS diff --git a/applications/osmo/README.md b/applications/osmo/README.md new file mode 100755 index 000000000..41e0e209a --- /dev/null +++ b/applications/osmo/README.md @@ -0,0 +1,489 @@ +# Physical AI Workflow Orchestration on Nebius Cloud + +Deploy [NVIDIA OSMO](https://nvidia.github.io/OSMO/main/user_guide/index.html) on [Nebius AI Cloud](https://nebius.com/ai-cloud) in minutes. Run simulation, training, and edge workflows on the wide variety of Nebius GPU instances—write once in YAML, run anywhere. 
+ +## Tested in/with +- eu-north-1 + +## Known Gaps and TODOs + +| Gap | Current Workaround | Status | +|-----|-------------------|--------| +| Multi-region support | Code tested only in eu-north1; other regions have different GPU platforms (H100/H200/L40S), CPU platforms (cpu-d3 vs cpu-e2), disk types, and PostgreSQL presets | TODO | +| No managed Redis service | Deploy Redis in-cluster via Helm | Workaround in place | +| MysteryBox lacks K8s CSI integration | Scripts retrieve secrets and create K8s secrets manually | Workaround in place | +| No External DNS service | Manual DNS configuration required | Not addressed | +| No managed SSL/TLS service | Manual certificate management | Not addressed | +| No public Load Balancer (ALB/NLB) | Use port-forwarding or WireGuard VPN for access | Workaround in place | +| IDP integration for Nebius | Using OSMO dev auth mode; Keycloak available but not integrated | TBD | +| Nebius Observability Stack integration | Using self-deployed Prometheus/Grafana/Loki | TODO | +| Single cluster for Control Plane + Backend | Using 1 MK8s cluster for both; production separation TBD | Discuss with Nebius | + +## What You Get + +Production-ready infrastructure-as-code (Terraform) and setup scripts for: +- **Managed Kubernetes (MK8s)** cluster with GPU and CPU node groups +- **GPU Infrastructure** including GPU Operator, Network Operator, and KAI Scheduler +- **Observability Stack** with Prometheus, Grafana, and Loki +- **OSMO Control Plane and Backend** for workflow orchestration +- **Supporting Services** including PostgreSQL, Object Storage, Filestore, and Container Registry +- **Secure Access** via WireGuard VPN (optional) + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Nebius AI Cloud │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Nebius VPC 
Network │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌────────────────────────────────────────────────┐ │ │ +│ │ │ WireGuard │ │ Nebius Managed Kubernetes (MK8s) │ │ │ +│ │ │ VPN │ │ │ │ │ +│ │ │ (Optional) │ │ ┌────────────────────────────────────────┐ │ │ │ +│ │ └──────┬──────┘ │ │ OSMO Namespace │ │ │ │ +│ │ │ │ │ ┌──────────┐ ┌────────┐ ┌──────────┐ │ │ │ │ +│ │ │ │ │ │ osmo- │ │ osmo- │ │ osmo- │ │ │ │ │ +│ │ │ │ │ │ service │ │ logger │ │ agent │ │ │ │ │ +│ │ │ │ │ └────┬─────┘ └───┬────┘ └────┬─────┘ │ │ │ │ +│ │ │ │ │ └────────────┼──────────┘ │ │ │ │ +│ │ │ │ │ ┌─────┴─────┐ │ │ │ │ +│ │ │ │ │ │osmo-proxy │ │ │ │ │ +│ │ │ │ │ │ (nginx) │ │ │ │ │ +│ │ │ │ │ └─────┬─────┘ │ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ │ │ ┌─────────┐ ┌────┴────┐ ┌─────────┐ │ │ │ │ +│ │ │ │ │ │ osmo-ui │ │osmo-ctrl│ │osmo- │ │ │ │ │ +│ │ │ │ │ │ (Web UI)│ │(sidecar)│ │backend │ │ │ │ │ +│ │ │ │ │ └─────────┘ └─────────┘ └─────────┘ │ │ │ │ +│ │ │ │ └────────────────────────────────────────┘ │ │ │ +│ │ │ │ │ │ │ +│ │ └─────────┼───► ┌──────────────┐ ┌───────────────────┐ │ │ │ +│ │ │ │ CPU Nodes │ │ GPU Nodes │ │ │ │ +│ │ │ │ (cpu-d3) │ │ (L40S/H100/H200) │ │ │ │ +│ │ │ │ System pods │ │ Workflow pods │ │ │ │ +│ │ │ └──────────────┘ └───────────────────┘ │ │ │ +│ │ │ │ │ │ +│ │ │ ┌────────────────────────────────────────┐ │ │ │ +│ │ │ │ Infrastructure Stack │ │ │ │ +│ │ │ │ GPU Operator, Network Operator, Cilium│ │ │ │ +│ │ │ │ Prometheus, Grafana, Loki │ │ │ │ +│ │ │ └────────────────────────────────────────┘ │ │ │ +│ │ └────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ Managed │ │ Object │ │ Shared │ │ Container │ │ +│ │ PostgreSQL │ │ Storage │ │ Filesystems │ │ Registry │ │ +│ │ (OSMO DB) │ │ (Workflow │ │ (Datasets) │ │ (Images) │ │ +│ │ │ │ logs/data) │ │ │ │ │ │ +│ └───────────────┘ └───────────────┘ └───────────────┘ 
└───────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +**Nebius Services Used:** + +| Service | Purpose | +|---------|---------| +| MK8s | Managed Kubernetes with CPU and GPU node groups | +| Managed PostgreSQL | Database for OSMO state and metadata | +| Object Storage | S3-compatible storage for workflow logs and artifacts | +| Shared Filesystems | NFS storage for datasets across nodes | +| Compute | VMs for WireGuard VPN (optional) | +| VPC | Private networking with subnet isolation | +| IAM | Service accounts and access keys | +| MysteryBox | Secrets management for credentials | +| Container Registry | Docker image storage for custom workflow images | + +## Prerequisites + +Before deploying, ensure you have access to Nebius AI Cloud and the required command-line tools installed. The deployment uses Terraform for infrastructure provisioning and Helm/kubectl for Kubernetes configuration. + +- [**Nebius Account**](https://console.eu.nebius.com/) with appropriate permissions (see [Required Permissions](#required-permissions)) +- [**Nebius CLI**](https://docs.nebius.com/cli/install) installed and authenticated +- [**Terraform**](https://developer.hashicorp.com/terraform/install) >= 1.5.0 for infrastructure-as-code +- [**kubectl**](https://kubernetes.io/docs/tasks/tools/) >= 1.28 for Kubernetes cluster management (should match cluster version ±1 minor) +- [**Helm**](https://helm.sh/docs/intro/install/) >= 3.0 for deploying OSMO charts +- **SSH key pair** for node access (generate with `ssh-keygen` if needed) + +## Quick Start + +> **Important:** Complete all steps in the **same terminal session**. The setup scripts export environment variables that must persist across steps. + +Please run this from a Linux Shell/Ubuntu/WSL. + +### 1. 
Install Required Tools + +```bash +cd deploy/000-prerequisites +./install-tools.sh # Installs: Terraform, kubectl, Helm, Nebius CLI, OSMO CLI +./install-tools.sh --check # Verify without installing +``` + +Supports Linux, WSL, and macOS. Requires Python/pip for OSMO CLI installation. See [prerequisites README](deploy/000-prerequisites/README.md) for manual installation. + +### 2. Configure Nebius Environment + +> **Note:** If not authenticated, run `nebius profile create` first and follow the authentication flow. + +```bash +source ./nebius-env-init.sh +``` + +This interactive script: +1. **Checks Nebius CLI** - Verifies installation and adds to PATH if needed +2. **Checks authentication** - If not authenticated, provides instructions to run `nebius profile create` +3. **Lists tenants** - Auto-detects if you have only one tenant +4. **Configures project** - Select existing project, create new one, or list available projects +5. **Sets region** - Choose between `eu-north1` (Finland) or `eu-west1` (Paris) +6. **Exports environment variables** - Sets `NEBIUS_*` and `TF_VAR_*` variables for Terraform + +### 3. Initialize Secrets (REQUIRED) + +```bash +source ./secrets-init.sh +``` + +> **Important:** This step is **REQUIRED** before running Terraform. If you skip it, `terraform apply` will fail with a clear error message. + +This generates secure credentials and stores them in [Nebius MysteryBox](https://docs.nebius.com/mysterybox): +- **PostgreSQL password** - Used by Managed PostgreSQL and OSMO +- **MEK (Master Encryption Key)** - Used by OSMO for data encryption + +The script exports `TF_VAR_*` environment variables that Terraform and setup scripts use to retrieve these secrets securely, keeping them out of Terraform state. + +### 4. Deploy Infrastructure + +Provision all Nebius cloud resources using Terraform: VPC network, Managed Kubernetes cluster, GPU/CPU node groups, PostgreSQL database, Object Storage, and optionally WireGuard VPN. 
+ +```bash +cd ../001-iac + +# Recommended: Cost-optimized for development (see Appendix A) +cp terraform.tfvars.cost-optimized.example terraform.tfvars + +# Edit terraform.tfvars if needed +terraform init +terraform plan -out plan.out +terraform apply plan.out +``` + +> **Note:** If you get an error about missing `postgresql_mysterybox_secret_id`, go back to step 3 and run `source ./secrets-init.sh`. + +See [Terraform README](deploy/001-iac/README.md) for configuration options, and [Appendix A](#appendix-a-terraform-configuration-presets) for preset comparisons. + +### 5. Configure Kubernetes + +1. Get Kubernetes credentials: + ```bash + nebius mk8s cluster get-credentials --id <cluster-id> --external + ``` + +2. Verify cluster access: + ```bash + kubectl get nodes + ``` + +3. Deploy GPU infrastructure and observability: + ```bash + cd ../002-setup + ./01-deploy-gpu-infrastructure.sh + ./02-deploy-observability.sh + ``` + + This installs: + - NVIDIA GPU Operator and Network Operator + - KAI Scheduler for GPU workload scheduling + - Prometheus, Grafana, and Loki for monitoring + +4. Deploy OSMO control plane: + ```bash + ./03-deploy-osmo-control-plane.sh + ``` + + This deploys the core OSMO services: + - Creates `osmo` namespace and PostgreSQL/MEK secrets + - Initializes databases on Nebius Managed PostgreSQL + - Deploys Redis and OSMO services (API, agent, worker, logger) + - Sets up nginx proxy for routing + + > **Note:** The script automatically retrieves PostgreSQL password and MEK from MysteryBox if you ran `secrets-init.sh` earlier. + +5. 
Deploy OSMO backend operator: + ```bash + ./04-deploy-osmo-backend.sh + ``` + + The script automatically: + - Starts a port-forward to OSMO service + - Logs in using dev method (since Keycloak auth is disabled) + - Creates a service token for the backend operator + - Deploys the backend operator + - Cleans up the port-forward + + This deploys the backend operator that manages GPU workloads: + - Connects to OSMO control plane via `osmo-agent` + - Configures resource pools for GPU nodes + - Enables workflow execution on the Kubernetes cluster + + > **Manual alternative:** If you prefer to create the token manually, set `OSMO_SERVICE_TOKEN` environment variable before running the script. + +6. Verify backend deployment: + + To verify the backend is registered with OSMO, start a port-forward and check: + ```bash + # Terminal 1: Start port-forward (keep running) + kubectl port-forward -n osmo svc/osmo-service 8080:80 + + # Terminal 2: Verify backend registration + osmo config show BACKEND default + ``` + + You should see the backend configuration with status `ONLINE`. + +7. Configure OSMO storage: + ```bash + ./05-configure-storage.sh + ``` + + The script automatically: + - Retrieves storage bucket details from Terraform + - Starts port-forward and logs in to OSMO + - Configures OSMO to use Nebius Object Storage for workflow artifacts + - Verifies the configuration + + > **Note:** The `osmo-storage` secret (with S3 credentials) was created automatically by `03-deploy-osmo-control-plane.sh`. + +8. 
Access OSMO (port-forwarding): + + Since the cluster uses private networking, use port-forwarding to access OSMO services: + + ```bash + # Terminal 1: Forward OSMO API (required for CLI commands) + kubectl port-forward -n osmo svc/osmo-service 8080:80 + + # Terminal 2: Forward OSMO Web UI + kubectl port-forward -n osmo svc/osmo-ui 8081:80 + ``` + + Access points: + - **OSMO API**: http://localhost:8080 (for CLI and API calls) + - **OSMO Web UI**: http://localhost:8081 (browser-based dashboard) + + Login to OSMO CLI (required before running commands): + ```bash + osmo login http://localhost:8080 --method dev --username admin + ``` + +9. Configure service URL (required for workflows): + ```bash + ./06-configure-service-url.sh + ``` + + The script configures `service_base_url` which is required for: + - The `osmo-ctrl` sidecar to stream workflow logs + - Task status reporting and completion tracking + - Authentication token refresh during workflow execution + + > **Important:** Without this configuration, workflows will get stuck with `FETCH_FAILURE` errors. + +10. Configure pool for GPU workloads: + + The default pool needs GPU platform configuration to run GPU workflows. This creates a pod template with the correct node selector and tolerations for GPU nodes: + + ```bash + ./07-configure-gpu-platform.sh + ``` + + The script: + - Creates a `gpu_tolerations` pod template with `nvidia.com/gpu.present: true` node selector + - Updates the GPU platform to reference this pod template + - Verifies GPU nodes are visible in OSMO + + You can verify the configuration: + ```bash + osmo config show POOL default + osmo config show POD_TEMPLATE gpu_tolerations + ``` + +11. 
Set up port-forwarding for OSMO access: + + Before using the OSMO CLI or Web UI, set up port-forwarding to the OSMO services: + + ```bash + # Terminal 1: Port-forward to OSMO API (required for CLI and API access) + kubectl port-forward -n osmo svc/osmo-service 8080:80 + + # Terminal 2: Port-forward to OSMO Web UI (optional, for browser access) + kubectl port-forward -n osmo svc/osmo-ui 8081:80 + ``` + + Then configure the OSMO CLI to use the forwarded port: + ```bash + osmo profile set endpoint http://localhost:8080 + ``` + + Access points: + - **OSMO API**: http://localhost:8080 + - **OSMO Web UI**: http://localhost:8081 + +12. Run a test workflow (optional): + + Verify the complete setup by running a test workflow from the `workflows/osmo/` directory: + + ```bash + # Set the default pool (required before submitting workflows) + osmo profile set pool default + + # Submit the hello world workflow (requires GPU) + osmo workflow submit ../../workflows/osmo/hello_nebius.yaml + + # Or specify the pool explicitly + osmo workflow submit ../../workflows/osmo/hello_nebius.yaml --pool default + + # Check workflow status + osmo workflow list + osmo workflow query <workflow-id> + + # View workflow logs (CLI - recommended when using port-forwarding) + osmo workflow query <workflow-id> --logs + ``` + + > **Note:** When using port-forwarding, the Web UI cannot display workflow logs (it tries to resolve internal Kubernetes DNS). Use the CLI commands above or `kubectl logs` instead. + + Available test workflows in `workflows/osmo/`: + - `hello_nebius.yaml` - Simple GPU hello world + - `gpu_test.yaml` - GPU validation test + + + +## Configuration Tiers + +| Tier | GPU Type | GPU Nodes | Security | Est. 
Cost/6h | +|------|----------|-----------|----------|--------------| +| **Cost-Optimized Secure** (recommended) | 1x L40S | 1 | WireGuard VPN | **~$15-25** | +| **Cost-Optimized** | 1x L40S | 1 | Public endpoints | ~$10-15 | +| **Standard** | 1x H100 | 1 | Public endpoints | ~$30-40 | +| **Production** | 8x H200 | 4+ | WireGuard VPN | ~$1000+ | + +**Recommended:** Use `terraform.tfvars.cost-optimized-secure.example` for development. + +See `deploy/001-iac/terraform.tfvars.*.example` files for all configuration options. + +## GPU Options + +| Platform | Preset | GPUs | VRAM | vCPUs | RAM | InfiniBand | +|----------|--------|------|------|-------|-----|------------| +| `gpu-l40s-a` | `1gpu-8vcpu-32gb` | 1 | 48GB | 8 | 32GB | No | +| `gpu-l40s-d` | `1gpu-8vcpu-32gb` | 1 | 48GB | 8 | 32GB | No | +| `gpu-h100-sxm` | `1gpu-16vcpu-200gb` | 1 | 80GB | 16 | 200GB | No | +| `gpu-h100-sxm` | `8gpu-128vcpu-1600gb` | 8 | 640GB | 128 | 1600GB | Yes | +| `gpu-h200-sxm` | `1gpu-16vcpu-200gb` | 1 | 141GB | 16 | 200GB | No | +| `gpu-h200-sxm` | `8gpu-128vcpu-1600gb` | 8 | 1128GB | 128 | 1600GB | Yes | + +**Recommendation:** Use `gpu-l40s-a` for development/testing (cheapest option). + +## Required Permissions + +This deployment uses the [Nebius Terraform Provider](https://docs.nebius.com/terraform-provider/quickstart) to provision cloud resources. Your Nebius account needs the following IAM roles to create and manage the required infrastructure. 
+ +### Minimum Required Roles +| Role | Purpose | +|------|---------| +| `compute.admin` | VMs, disks, shared filesystems, GPU clusters | +| `vpc.admin` | VPC networks and subnets | +| `mk8s.admin` | Managed Kubernetes clusters and node groups | +| `storage.admin` | Object Storage buckets | +| `mdb.admin` | Managed PostgreSQL clusters | +| `iam.serviceAccounts.admin` | Service accounts and access keys | +| `container-registry.admin` | Container registries | + +### For WireGuard VPN (Optional) +| Role | Purpose | +|------|---------| +| `vpc.publicIpAllocations.admin` | Allocate public IPs for VPN endpoint | + +For more information, see [Nebius IAM Roles](https://docs.nebius.com/iam/authorization/roles) and the [Terraform Provider Quickstart](https://docs.nebius.com/terraform-provider/quickstart). + +## Security Options + +### Option A: WireGuard VPN (Recommended for Production) + +Enable private-only access with WireGuard VPN: + +```hcl +# In terraform.tfvars +enable_wireguard = true +enable_public_endpoint = false +``` + +After deployment: +```bash +cd deploy/000-prerequisites +./wireguard-client-setup.sh +``` + +### Option B: Public Endpoints + +For development/testing with public access: + +```hcl +# In terraform.tfvars +enable_wireguard = false +enable_public_endpoint = true +``` + +## Cost Optimization Tips + +1. **Use preemptible GPU nodes** for non-critical workloads (up to 70% savings) +2. **Start with single-GPU nodes** for development +3. **Disable unused components** (Loki, multi-GPU support) +4. **Scale down when not in use** + +## Documentation + +- [Terraform Infrastructure](deploy/001-iac/README.md) +- [Setup Scripts](deploy/002-setup/README.md) +- [Troubleshooting Guide](docs/troubleshooting.md) +- [Security Guide](docs/SECURITY.md) + +## License + +Apache License 2.0 - See [LICENSE](LICENSE) for details. 
+ +--- + +## Appendix A: Terraform Configuration Presets + +The `deploy/001-iac/` directory includes several pre-configured `terraform.tfvars` examples for different use cases: + +| Preset | GPU | WireGuard | Public API | Use Case | +|--------|-----|-----------|------------|----------| +| `terraform.tfvars.cost-optimized.example` | L40S | No | Yes | **Recommended for development** - Lowest cost, quick testing | +| `terraform.tfvars.cost-optimized-secure.example` | L40S | Yes | No | Development with VPN-only access | +| `terraform.tfvars.secure.example` | H100 | Yes | No | Staging with full security | +| `terraform.tfvars.production.example` | H200 | Yes | No | Production with maximum performance | +| `terraform.tfvars.example` | H100 | No | Yes | Basic template with all options documented | + +> **Note:** All configurations use **private nodes** (no public IPs on K8s nodes). Access is via WireGuard VPN or public K8s API endpoint. + +### Key Differences + +| Preset | GPU Nodes | CPU Nodes | etcd Size | Preemptible | Security | +|--------|-----------|-----------|-----------|-------------|----------| +| **cost-optimized-secure** | 1x L40S | 2x small | 1 | Yes | VPN only | +| **cost-optimized** | 1x L40S | 2x small | 1 | Yes | Public endpoints | +| **secure** | 8x H100 | 3x medium | 3 | No | VPN only | +| **production** | 4x 8-GPU H200 | 3x large | 3 | No | VPN only | + +**Recommendation:** Start with `terraform.tfvars.cost-optimized-secure.example` for development, then scale up as needed. + +## Cleanup + +To tear down the deployment, see [deploy/README.md](deploy/README.md#cleanup) for detailed instructions. The process involves: + +1. Uninstalling Kubernetes components (in reverse order) via scripts in `deploy/002-setup/cleanup/` +2. 
Destroying infrastructure with `terraform destroy` in `deploy/001-iac/` diff --git a/applications/osmo/deploy/000-prerequisites/README.md b/applications/osmo/deploy/000-prerequisites/README.md new file mode 100755 index 000000000..459b03d8d --- /dev/null +++ b/applications/osmo/deploy/000-prerequisites/README.md @@ -0,0 +1,291 @@ +# Prerequisites + +This directory contains scripts to set up your environment for deploying OSMO on Nebius. + +## Scripts + +| Script | Purpose | +|--------|---------| +| `install-tools.sh` | Install required tools (Terraform, kubectl, Helm, Nebius CLI, OSMO CLI) | +| `nebius-env-init.sh` | Configure Nebius environment variables | +| `secrets-init.sh` | **NEW** Generate and store secrets in MysteryBox | +| `wireguard-client-setup.sh` | Set up WireGuard VPN client | + +## Quick Start + +### 1. Install Required Tools + +```bash +# Install all required tools +./install-tools.sh + +# Or check what's already installed +./install-tools.sh --check +``` + +### 2. Configure Nebius Environment + +```bash +# Source the script (don't just run it) +source ./nebius-env-init.sh +``` + +This will: +1. Check Nebius CLI installation +2. Verify authentication status +3. Prompt for tenant ID +4. Let you choose to use an existing project OR create a new one +5. Set environment variables for Terraform + +#### Project Options + +When prompted for a project, you can: +- **Option 1**: Enter an existing project ID directly +- **Option 2**: Create a new project (enter a name) +- **Option 3**: List existing projects first, then choose + +Example creating a new project: +``` +Project Configuration + +Options: + 1) Use existing project (enter project ID) + 2) Create new project + 3) List existing projects first + +Choose option [1/2/3]: 2 +Enter new project name: osmo-dev +Creating new project: osmo-dev +[✓] Project created successfully + Project ID: project-abc123xyz +``` + +### 3. 
Initialize Secrets (Recommended) + +```bash +# Generate secrets and store in MysteryBox +source ./secrets-init.sh +``` + +This creates: +- **PostgreSQL password** - Stored in MysteryBox, NOT in Terraform state +- **MEK (Master Encryption Key)** - For OSMO service authentication + +> **Why?** Storing secrets in MysteryBox keeps them out of Terraform state, providing better security and enabling rotation without re-deploying. + +## Nebius CLI Authentication + +### First-Time Setup + +If you haven't authenticated the Nebius CLI yet: + +```bash +# Create a profile (interactive) +nebius profile create +``` + +The CLI will: +1. Ask for a profile name +2. Open a browser for authentication +3. Ask you to select tenant and project + +### WSL Users + +If the browser doesn't open automatically in WSL: +1. Copy the URL displayed in the terminal +2. Paste it into your Windows browser +3. Complete the authentication +4. Return to the terminal + +### Service Account Authentication + +For CI/CD or automated deployments, use service account authentication: + +1. **Create a service account** in Nebius Console +2. **Create an authorized key** (PEM file) +3. **Configure the CLI**: + ```bash + nebius profile create --auth-type service-account \ + --service-account-id \ + --key-file + ``` + +See [Nebius Service Accounts Documentation](https://docs.nebius.com/iam/service-accounts) for details. 
+ +## Required Permissions + +Your Nebius account needs these permissions: + +### Compute +- `compute.instances.create/delete` - VMs for WireGuard, bastion +- `compute.disks.create/delete` - Boot and data disks +- `compute.filesystems.create/delete` - Shared filesystems + +### Kubernetes +- `mk8s.clusters.create/delete` - Kubernetes clusters +- `mk8s.nodeGroups.create/delete` - Node groups + +### Networking +- `vpc.networks.create/delete` - VPC networks +- `vpc.subnets.create/delete` - Subnets +- `vpc.publicIpAllocations.create/delete` - Public IPs (for WireGuard) + +### Storage +- `storage.buckets.create/delete` - Object storage + +### Database +- `mdb.clusters.create/delete` - Managed PostgreSQL + +### IAM +- `iam.serviceAccounts.create/delete` - Service accounts +- `iam.accessKeys.create/delete` - Access keys for S3 + +### Container Registry +- `container-registry.registries.create/delete` - Container registries + +See [Nebius IAM Roles](https://docs.nebius.com/iam/authorization/roles) for predefined roles. + +## Secrets Management + +### Using MysteryBox (Recommended) + +The `secrets-init.sh` script generates secrets and stores them in Nebius MysteryBox: + +```bash +source ./secrets-init.sh +``` + +This will: +1. Check if secrets already exist in MysteryBox (by name) +2. If not, generate a secure PostgreSQL password (32 chars) and MEK +3. Store new secrets in MysteryBox (Nebius secrets manager) +4. Export `TF_VAR_*` environment variables for Terraform + +### New Terminal Session + +If you start a new terminal session, simply run the script again: + +```bash +source ./secrets-init.sh +``` + +The script will detect existing secrets by name and export their IDs without regenerating them. 
+ +### Retrieving Secrets + +To retrieve secrets from MysteryBox: + +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_POSTGRESQL_SECRET_ID \ + --key password \ + --format json | jq -r '.data.string_value' + +# MEK +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_MEK_SECRET_ID \ + --key mek \ + --format json | jq -r '.data.string_value' +``` + +### Security Considerations + +When using MysteryBox secrets: +- Secrets are **NOT** stored in Terraform state +- Only secret IDs are stored in Terraform +- Secrets are fetched at runtime using ephemeral resources +- The password output will be `null` (retrieve via CLI instead) + +### Without MysteryBox + +If you don't run `secrets-init.sh`, Terraform will: +1. Generate a random password for PostgreSQL +2. Store the password in Terraform state (less secure) +3. Output the password via `terraform output -json` + +## Environment Variables + +After running `nebius-env-init.sh`, these variables are set: + +| Variable | Description | +|----------|-------------| +| `NEBIUS_TENANT_ID` | Your Nebius tenant ID | +| `NEBIUS_PROJECT_ID` | Your Nebius project ID | +| `NEBIUS_REGION` | Deployment region (default: eu-north1) | +| `TF_VAR_tenant_id` | Terraform variable for tenant | +| `TF_VAR_parent_id` | Terraform variable for project | +| `TF_VAR_region` | Terraform variable for region | + +After running `secrets-init.sh`, these additional variables are set: + +| Variable | Description | +|----------|-------------| +| `OSMO_POSTGRESQL_SECRET_ID` | MysteryBox secret ID for PostgreSQL password | +| `OSMO_MEK_SECRET_ID` | MysteryBox secret ID for MEK | +| `TF_VAR_postgresql_mysterybox_secret_id` | Terraform variable for PostgreSQL secret | +| `TF_VAR_mek_mysterybox_secret_id` | Terraform variable for MEK secret | + +## WireGuard VPN Setup + +If you enabled WireGuard VPN in your deployment: + +```bash +./wireguard-client-setup.sh +``` + +This will: +1. 
Check if WireGuard is installed locally +2. Get server information from Terraform outputs +3. Generate client configuration template +4. Provide instructions for completing setup + +### Windows/WSL + +For WSL users, install WireGuard on Windows: +1. Download from https://www.wireguard.com/install/ +2. Import the generated configuration file +3. Connect through the Windows WireGuard app + +## Troubleshooting + +### "Nebius CLI not installed" + +Run the installer: +```bash +./install-tools.sh +``` + +Or install manually: +```bash +curl -sSL https://storage.eu-north1.nebius.cloud/nebius/install.sh | bash +export PATH="$HOME/.nebius/bin:$PATH" +``` + +### "Nebius CLI not authenticated" + +Authenticate with: +```bash +nebius profile create +``` + +### "Permission denied" + +Ensure scripts are executable: +```bash +chmod +x *.sh +``` + +### "Token error" or corrupted token + +Clear the token and re-authenticate: +```bash +unset NEBIUS_IAM_TOKEN +nebius profile create +``` + +### WSL browser doesn't open + +1. Copy the URL from the terminal output +2. Paste into your Windows browser manually +3. 
Complete authentication and return to terminal diff --git a/applications/osmo/deploy/000-prerequisites/install-tools.sh b/applications/osmo/deploy/000-prerequisites/install-tools.sh new file mode 100755 index 000000000..4c2c6a829 --- /dev/null +++ b/applications/osmo/deploy/000-prerequisites/install-tools.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# +# Install required tools for OSMO on Nebius deployment +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +print_status() { + echo -e "${GREEN}[✓]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[!]${NC} $1" +} + +print_error() { + echo -e "${RED}[✗]${NC} $1" +} + +# Version requirements +TERRAFORM_MIN_VERSION="1.5.0" +KUBECTL_MIN_VERSION="1.28.0" +HELM_MIN_VERSION="3.12.0" + +# Detect OS +detect_os() { + if [[ "$OSTYPE" == "linux-gnu"* ]]; then + if grep -q Microsoft /proc/version 2>/dev/null; then + echo "wsl" + else + echo "linux" + fi + elif [[ "$OSTYPE" == "darwin"* ]]; then + echo "macos" + else + echo "unknown" + fi +} + +OS=$(detect_os) + +# Check if command exists (including Nebius in custom locations) +check_command() { + local cmd=$1 + if [[ "$cmd" == "nebius" ]]; then + # Check PATH first, then common installation locations + if command -v nebius &>/dev/null; then + return 0 + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + return 0 + fi + return 1 + else + command -v "$cmd" &>/dev/null + fi +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Compare versions (returns 0 if version >= required) +version_ge() { + local version=$1 + local required=$2 + printf '%s\n%s' "$required" "$version" | sort -V -C +} + +check_terraform() { + if check_command terraform; then + local version=$(terraform version -json 2>/dev/null | grep -o '"terraform_version": *"[^"]*"' | cut -d'"' -f4) + if [[ -z 
"$version" ]]; then + version=$(terraform version | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') + fi + if version_ge "$version" "$TERRAFORM_MIN_VERSION"; then + print_status "Terraform $version installed" + return 0 + else + print_warning "Terraform $version installed, but >= $TERRAFORM_MIN_VERSION recommended" + return 0 + fi + fi + return 1 +} + +check_kubectl() { + if check_command kubectl; then + # Use --client flag with timeout to prevent hanging + # Some kubectl versions try to contact server even with --client + local version=$(timeout 5 kubectl version --client --short 2>/dev/null | grep -oE 'v[0-9]+\.[0-9]+\.[0-9]+' | head -1 | tr -d 'v') + if [[ -z "$version" ]]; then + # Fallback: try without --short flag + version=$(timeout 5 kubectl version --client 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) + fi + if [[ -n "$version" ]]; then + print_status "kubectl $version installed" + return 0 + else + # kubectl exists but version check failed - still report as installed + print_status "kubectl installed (version check skipped)" + return 0 + fi + fi + return 1 +} + +check_helm() { + if check_command helm; then + local version=$(helm version --short 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') + if [[ -n "$version" ]]; then + print_status "Helm $version installed" + return 0 + fi + fi + return 1 +} + +check_nebius() { + if check_command nebius; then + local nebius_path=$(get_nebius_path) + local version=$("$nebius_path" version 2>/dev/null | head -1 || echo "unknown") + print_status "Nebius CLI installed ($version)" + if [[ "$nebius_path" == "$HOME/.nebius/bin/nebius" ]] && ! command -v nebius &>/dev/null; then + print_warning "Nebius CLI not in PATH. 
Run this first:" + echo "" + echo " export PATH=\"\$HOME/.nebius/bin:\$PATH\"" + echo "" + fi + return 0 + fi + return 1 +} + +check_osmo() { + if check_command osmo; then + local version=$(osmo --version 2>/dev/null | head -1 || echo "unknown") + print_status "OSMO CLI installed ($version)" + return 0 + fi + return 1 +} + +install_terraform() { + echo "Installing Terraform..." + case $OS in + linux|wsl) + wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list + sudo apt-get update && sudo apt-get install -y terraform + ;; + macos) + brew tap hashicorp/tap + brew install hashicorp/tap/terraform + ;; + esac +} + +install_kubectl() { + echo "Installing kubectl..." + case $OS in + linux|wsl) + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + ;; + macos) + brew install kubectl + ;; + esac +} + +install_helm() { + echo "Installing Helm..." + case $OS in + linux|wsl) + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + ;; + macos) + brew install helm + ;; + esac +} + +install_nebius() { + echo "Installing Nebius CLI..." + # Note: URL updated per https://docs.nebius.com/cli/install + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash + + # Add to PATH for current session + export PATH="$HOME/.nebius/bin:$PATH" + + print_warning "Nebius CLI installed to ~/.nebius/bin/" + print_warning "Add to your shell profile: export PATH=\"\$HOME/.nebius/bin:\$PATH\"" +} + +install_osmo() { + echo "Installing OSMO CLI..." 
+ # Install via official NVIDIA install script + # See: https://nvidia.github.io/OSMO/main/user_guide/getting_started/install/index.html + curl -fsSL https://raw.githubusercontent.com/NVIDIA/OSMO/refs/heads/main/install.sh | bash + + # The install script typically adds osmo to ~/.local/bin or similar + if [[ ":$PATH:" != *":$HOME/.local/bin:"* ]]; then + export PATH="$HOME/.local/bin:$PATH" + fi + print_status "OSMO CLI installed" +} + +# Main logic +main() { + echo "========================================" + echo "OSMO on Nebius - Tool Installer" + echo "========================================" + echo "" + echo "Detected OS: $OS" + echo "" + + local check_only=false + if [[ "$1" == "--check" ]]; then + check_only=true + echo "Checking installed tools..." + echo "" + fi + + local all_installed=true + + # Check/Install Terraform + if ! check_terraform; then + all_installed=false + if $check_only; then + print_error "Terraform not installed" + else + install_terraform + check_terraform || print_error "Failed to install Terraform" + fi + fi + + # Check/Install kubectl + if ! check_kubectl; then + all_installed=false + if $check_only; then + print_error "kubectl not installed" + else + install_kubectl + check_kubectl || print_error "Failed to install kubectl" + fi + fi + + # Check/Install Helm + if ! check_helm; then + all_installed=false + if $check_only; then + print_error "Helm not installed" + else + install_helm + check_helm || print_error "Failed to install Helm" + fi + fi + + # Check/Install Nebius CLI + if ! check_nebius; then + all_installed=false + if $check_only; then + print_error "Nebius CLI not installed" + else + install_nebius + check_nebius || print_error "Failed to install Nebius CLI" + fi + fi + + # Check/Install OSMO CLI (for backend deployment and workflow management) + if ! 
check_osmo; then + all_installed=false + if $check_only; then + print_error "OSMO CLI not installed" + else + install_osmo + check_osmo || print_error "Failed to install OSMO CLI" + fi + fi + + echo "" + if $all_installed; then + echo "========================================" + print_status "All required tools are installed!" + echo "========================================" + echo "" + echo "Next step: Configure your Nebius environment" + echo " source ./nebius-env-init.sh" + else + if $check_only; then + echo "========================================" + print_warning "Some tools are missing. Run without --check to install." + echo "========================================" + exit 1 + fi + fi +} + +main "$@" diff --git a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh new file mode 100755 index 000000000..c95a32e37 --- /dev/null +++ b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh @@ -0,0 +1,418 @@ +#!/bin/bash +# +# Nebius Environment Initialization Script +# +# This script sets up environment variables needed for Terraform deployment. 
+# Run with: source ./nebius-env-init.sh +# +# NOTE: Do NOT use 'set -e' as this script is meant to be sourced +# + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +echo "" +echo "========================================" +echo " Nebius Environment Initialization" +echo "========================================" +echo "" + +# Detect WSL +is_wsl() { + grep -qi microsoft /proc/version 2>/dev/null +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Check if Nebius CLI is installed +check_nebius_cli() { + local nebius_path=$(get_nebius_path) + if [[ -z "$nebius_path" ]]; then + echo -e "${RED}[ERROR]${NC} Nebius CLI is not installed." + echo "" + echo "Install it by running: ./install-tools.sh" + echo "Or manually: curl -sSL https://storage.eu-north1.nebius.cloud/nebius/install.sh | bash" + return 1 + fi + + # Add to PATH if needed + if ! 
command -v nebius &>/dev/null && [[ -x "$HOME/.nebius/bin/nebius" ]]; then
+        export PATH="$HOME/.nebius/bin:$PATH"
+        echo -e "${YELLOW}[INFO]${NC} Added ~/.nebius/bin to PATH"
+    fi
+
+    return 0
+}
+
+# Check if Nebius CLI is authenticated.
+# Returns 0 when `nebius profile list` succeeds, 1 otherwise.
+# Side effect: unsets a stale NEBIUS_IAM_TOKEN so it cannot shadow the
+# profile-based authentication (script is sourced, so the unset sticks).
+check_nebius_auth() {
+    local nebius_path=$(get_nebius_path)
+    if [[ -z "$nebius_path" ]]; then
+        return 1
+    fi
+
+    # Clear potentially corrupted token
+    if [[ -n "$NEBIUS_IAM_TOKEN" ]]; then
+        echo -e "${YELLOW}[INFO]${NC} Clearing NEBIUS_IAM_TOKEN environment variable"
+        unset NEBIUS_IAM_TOKEN
+    fi
+
+    # Test authentication by listing profiles
+    if "$nebius_path" profile list &>/dev/null; then
+        return 0
+    fi
+    return 1
+}
+
+# Interactive prompt with default value.
+# $1 - prompt text, $2 - default (may be empty), $3 - name of the
+#      variable (in the caller's scope) to receive the answer.
+prompt_with_default() {
+    local prompt=$1
+    local default=$2
+    local var_name=$3
+    local value
+
+    if [[ -n "$default" ]]; then
+        read -p "$prompt [$default]: " value
+        value=${value:-$default}
+    else
+        read -p "$prompt: " value
+    fi
+
+    # Assign via printf -v instead of eval: eval "$var_name='$value'"
+    # broke (and could execute injected commands) whenever the typed
+    # value contained a single quote. printf -v is quote-safe.
+    printf -v "$var_name" '%s' "$value"
+}
+
+# List existing projects in a tenant ($1 = tenant ID).
+# Prints a human-readable listing; returns 1 when the tenant has none.
+list_projects() {
+    local tenant_id=$1
+    local nebius_path=$(get_nebius_path)
+
+    echo -e "${CYAN}Fetching existing projects...${NC}"
+    local projects=$("$nebius_path" iam project list --parent-id "$tenant_id" --format json 2>/dev/null)
+
+    if [[ -z "$projects" || "$projects" == "[]" ]]; then
+        echo "  No projects found in this tenant."
+        return 1
+    fi
+
+    echo ""
+    echo "Existing projects:"
+    echo "$projects" | jq -r '.[] | "  - \(.metadata.name) (\(.metadata.id))"' 2>/dev/null || echo "  (Could not parse projects)"
+    echo ""
+    return 0
+}
+
+# Create a new project ($1 = tenant ID, $2 = project name).
+# Stdout contract: callers capture this function with $(...), so the ONLY
+# thing written to stdout is the bare project ID. All status banners go
+# to stderr — previously they were printed to stdout and ended up inside
+# the captured NEBIUS_PROJECT_ID, which then failed the ^project- check.
+create_project() {
+    local tenant_id=$1
+    local project_name=$2
+    local nebius_path=$(get_nebius_path)
+
+    echo -e "${BLUE}Creating new project: $project_name${NC}" >&2
+
+    # Send the CLI's own output to stderr as well (the old `2>&1`
+    # folded it into the captured value).
+    if "$nebius_path" iam project create --parent-id "$tenant_id" --name "$project_name" >&2; then
+        echo -e "${GREEN}[✓]${NC} Project created successfully" >&2
+
+        # Look up the ID of the project we just created
+        local project_id
+        project_id=$("$nebius_path" iam project get-by-name --parent-id "$tenant_id" --name "$project_name" --format json 2>/dev/null | jq -r '.metadata.id')
+
+        if [[ -n "$project_id" && "$project_id" != "null" ]]; then
+            echo "    Project ID: $project_id" >&2
+            echo "$project_id"
+            return 0
+        fi
+    fi
+
+    echo -e "${RED}[ERROR]${NC} Failed to create project" >&2
+    return 1
+}
+
+# Get project ID by name ($1 = tenant ID, $2 = project name).
+# Prints the ID (or the string "null") on stdout; diagnostics suppressed.
+get_project_id_by_name() {
+    local tenant_id=$1
+    local project_name=$2
+    local nebius_path=$(get_nebius_path)
+
+    "$nebius_path" iam project get-by-name --parent-id "$tenant_id" --name "$project_name" --format json 2>/dev/null | jq -r '.metadata.id'
+}
+
+# Interactive project selection/creation ($1 = tenant ID).
+# Sets NEBIUS_PROJECT_ID in the caller's (sourced) environment.
+select_or_create_project() {
+    local tenant_id=$1
+    local nebius_path=$(get_nebius_path)
+
+    echo ""
+    echo -e "${BLUE}Project Configuration${NC}"
+    echo ""
+    echo "Options:"
+    echo "  1) Use existing project (enter project ID)"
+    echo "  2) Create new project"
+    echo "  3) List existing projects first"
+    echo ""
+
+    local choice
+    read -p "Choose option [1/2/3]: " choice
+
+    case $choice in
+        1)
+            read -p "Enter Project ID: " NEBIUS_PROJECT_ID
+            ;;
+        2)
+            local project_name
+            read -p "Enter new project name: " project_name
+
+            if [[ -z "$project_name" ]]; then
+                echo -e "${RED}[ERROR]${NC} Project name cannot be empty"
+                return 1
+            fi
+
+            # Check if project already exists
+            local 
existing_id=$(get_project_id_by_name "$tenant_id" "$project_name") + if [[ -n "$existing_id" && "$existing_id" != "null" ]]; then + echo -e "${YELLOW}[INFO]${NC} Project '$project_name' already exists" + echo " Using existing project ID: $existing_id" + NEBIUS_PROJECT_ID="$existing_id" + else + NEBIUS_PROJECT_ID=$(create_project "$tenant_id" "$project_name") + if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then + return 1 + fi + fi + ;; + 3) + list_projects "$tenant_id" + echo "" + read -p "Enter Project ID from the list above (or 'new' to create): " input + + if [[ "$input" == "new" ]]; then + local project_name + read -p "Enter new project name: " project_name + + if [[ -z "$project_name" ]]; then + echo -e "${RED}[ERROR]${NC} Project name cannot be empty" + return 1 + fi + + NEBIUS_PROJECT_ID=$(create_project "$tenant_id" "$project_name") + if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then + return 1 + fi + else + NEBIUS_PROJECT_ID="$input" + fi + ;; + *) + echo -e "${RED}[ERROR]${NC} Invalid option" + return 1 + ;; + esac + + return 0 +} + +# Main initialization +main() { + # Step 1: Check Nebius CLI + echo -e "${BLUE}Step 1: Checking Nebius CLI${NC}" + if ! check_nebius_cli; then + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI found" + echo "" + + # Step 2: Check authentication + echo -e "${BLUE}Step 2: Checking authentication${NC}" + if ! check_nebius_auth; then + echo -e "${YELLOW}[!]${NC} Nebius CLI not authenticated" + echo "" + echo "Please authenticate with Nebius CLI before running this script." + echo "" + echo "Authentication steps:" + echo " 1. Run: nebius profile create" + echo " 2. Follow the interactive prompts" + echo " 3. Complete browser-based authentication" + if is_wsl; then + echo "" + echo -e "${YELLOW}WSL Note:${NC} If browser doesn't open automatically," + echo " copy the URL from the terminal and paste it in your browser." 
+ fi + echo "" + echo "After authentication, run this script again:" + echo " source ./nebius-env-init.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI authenticated" + echo "" + + # Step 3: Get current profile info + echo -e "${BLUE}Step 3: Retrieving profile information${NC}" + + local nebius_path=$(get_nebius_path) + + # Check for existing environment variables or use defaults + local current_tenant="${NEBIUS_TENANT_ID:-}" + local current_project="${NEBIUS_PROJECT_ID:-}" + local current_region="${NEBIUS_REGION:-eu-north1}" + + # Try to list tenants to help user find their tenant ID + echo "Fetching available tenants..." + local tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) + if [[ -n "$tenants" && "$tenants" != "[]" ]]; then + echo "" + echo "Available tenants:" + echo "$tenants" | jq -r '.[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true + # Auto-detect if only one tenant + local tenant_count=$(echo "$tenants" | jq -r 'length' 2>/dev/null || echo "0") + if [[ "$tenant_count" == "1" && -z "$current_tenant" ]]; then + current_tenant=$(echo "$tenants" | jq -r '.[0].metadata.id' 2>/dev/null) + echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" + fi + fi + + echo "" + + # Step 4: Interactive configuration + echo -e "${BLUE}Step 4: Configure deployment settings${NC}" + echo "" + + # Tenant ID + if [[ -z "$current_tenant" ]]; then + echo "Tenant ID is required. Find it in the Nebius Console under IAM > Tenants" + prompt_with_default "Enter Tenant ID" "" "NEBIUS_TENANT_ID" + else + prompt_with_default "Tenant ID" "$current_tenant" "NEBIUS_TENANT_ID" + fi + + # Validate tenant ID + if [[ -z "$NEBIUS_TENANT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} Tenant ID is required!" + return 1 + fi + + # Project ID - with option to create + if [[ -z "$current_project" ]]; then + echo "" + echo "No project configured. You can use an existing project or create a new one." + if ! 
select_or_create_project "$NEBIUS_TENANT_ID"; then + return 1 + fi + else + echo "" + echo "Current project: $current_project" + read -p "Use this project? (Y/n/new): " use_current + + case $use_current in + n|N) + if ! select_or_create_project "$NEBIUS_TENANT_ID"; then + return 1 + fi + ;; + new) + local project_name + read -p "Enter new project name: " project_name + NEBIUS_PROJECT_ID=$(create_project "$NEBIUS_TENANT_ID" "$project_name") + if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then + return 1 + fi + ;; + *) + NEBIUS_PROJECT_ID="$current_project" + ;; + esac + fi + + # Validate project ID format + if [[ -z "$NEBIUS_PROJECT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} Project ID is required!" + return 1 + fi + + # Check if project ID looks valid (should start with 'project-') + if [[ ! "$NEBIUS_PROJECT_ID" =~ ^project-[a-z0-9]+ ]]; then + echo -e "${RED}[ERROR]${NC} Invalid project ID format: '$NEBIUS_PROJECT_ID'" + echo " Project IDs should look like: project-e00abc123def456" + echo "" + echo " Run this to list your projects:" + echo " nebius iam project list --parent-id $NEBIUS_TENANT_ID" + return 1 + fi + + # Region + echo "" + echo "Available regions:" + echo " - eu-north1 (Finland - H100)" + echo " - eu-west1 (Paris - H200)" + prompt_with_default "Region" "${current_region:-eu-north1}" "NEBIUS_REGION" + + # Step 5: Export environment variables + echo "" + echo -e "${BLUE}Step 5: Setting environment variables${NC}" + + export NEBIUS_TENANT_ID + export NEBIUS_PROJECT_ID + export NEBIUS_REGION + + # Get IAM token for Terraform provider authentication + echo "Getting IAM token for Terraform..." 
+ unset NEBIUS_IAM_TOKEN # Clear any old/corrupted token + export NEBIUS_IAM_TOKEN=$("$nebius_path" iam get-access-token) + + if [[ -z "$NEBIUS_IAM_TOKEN" ]]; then + echo -e "${RED}[ERROR]${NC} Failed to get IAM token" + return 1 + fi + echo -e "${GREEN}[✓]${NC} IAM token obtained" + + # Terraform variables + export TF_VAR_tenant_id="$NEBIUS_TENANT_ID" + export TF_VAR_parent_id="$NEBIUS_PROJECT_ID" + export TF_VAR_region="$NEBIUS_REGION" + + echo "" + echo -e "${GREEN}[✓]${NC} Environment variables set:" + echo " NEBIUS_TENANT_ID = $NEBIUS_TENANT_ID" + echo " NEBIUS_PROJECT_ID = $NEBIUS_PROJECT_ID" + echo " NEBIUS_REGION = $NEBIUS_REGION" + echo " NEBIUS_IAM_TOKEN = ${NEBIUS_IAM_TOKEN:0:20}... (truncated)" + echo " TF_VAR_tenant_id = $TF_VAR_tenant_id" + echo " TF_VAR_parent_id = $TF_VAR_parent_id" + echo " TF_VAR_region = $TF_VAR_region" + + # Step 6: Verify connectivity + echo "" + echo -e "${BLUE}Step 6: Verifying connectivity${NC}" + + if "$nebius_path" iam project get --id "$NEBIUS_PROJECT_ID" &>/dev/null; then + echo -e "${GREEN}[✓]${NC} Successfully connected to Nebius project" + else + echo -e "${YELLOW}[!]${NC} Could not verify project access (this may be normal for new projects)" + fi + + echo "" + echo "========================================" + echo -e "${GREEN}Environment initialization complete!${NC}" + echo "========================================" + echo "" + echo "Next steps:" + echo " 1. source ./secrets-init.sh # Initialize MysteryBox secrets (recommended)" + echo " 2. cd ../001-iac" + echo " 3. cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars" + echo " 4. 
terraform init && terraform apply" + echo "" + + return 0 +} + +# Run main function +main diff --git a/applications/osmo/deploy/000-prerequisites/secrets-init.sh b/applications/osmo/deploy/000-prerequisites/secrets-init.sh new file mode 100755 index 000000000..bbdec3e19 --- /dev/null +++ b/applications/osmo/deploy/000-prerequisites/secrets-init.sh @@ -0,0 +1,380 @@ +#!/bin/bash +# +# OSMO on Nebius - Secrets Initialization Script +# +# This script generates secrets and stores them in Nebius MysteryBox. +# Secrets are NOT stored in Terraform state - only the secret IDs are used. +# +# Usage: +# source ./secrets-init.sh +# +# Prerequisites: +# - Nebius CLI installed and authenticated +# - Environment variables set (run nebius-env-init.sh first) +# - jq installed +# + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Default secret names +POSTGRESQL_SECRET_NAME="${OSMO_POSTGRESQL_SECRET_NAME:-osmo-postgresql-password}" +MEK_SECRET_NAME="${OSMO_MEK_SECRET_NAME:-osmo-mek}" + +echo "" +echo "========================================" +echo " OSMO Secrets Initialization" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Helper Functions +# ----------------------------------------------------------------------------- + +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +check_prerequisites() { + echo -e "${BLUE}Step 1: Checking prerequisites${NC}" + + # Check Nebius CLI + local nebius_path=$(get_nebius_path) + if [[ -z "$nebius_path" ]]; then + echo -e "${RED}[ERROR]${NC} Nebius CLI not found" + echo " Run: ./install-tools.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Nebius CLI found" + + # Check jq + if ! 
command -v jq &>/dev/null; then + echo -e "${RED}[ERROR]${NC} jq not found" + echo " Install: sudo apt-get install jq" + return 1 + fi + echo -e "${GREEN}[✓]${NC} jq found" + + # Check openssl + if ! command -v openssl &>/dev/null; then + echo -e "${RED}[ERROR]${NC} openssl not found" + return 1 + fi + echo -e "${GREEN}[✓]${NC} openssl found" + + # Check environment variables + if [[ -z "$NEBIUS_PROJECT_ID" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_PROJECT_ID not set" + echo " Run: source ./nebius-env-init.sh" + return 1 + fi + echo -e "${GREEN}[✓]${NC} NEBIUS_PROJECT_ID set: $NEBIUS_PROJECT_ID" + + echo "" + return 0 +} + +# Generate secure password meeting Nebius PostgreSQL requirements +generate_postgresql_password() { + # Requirements: + # - Min 8 characters (we use 32) + # - At least one lowercase, uppercase, digit, special char + # - No % character + + local password="" + local attempts=0 + local max_attempts=10 + + while [[ $attempts -lt $max_attempts ]]; do + # Generate base password + password=$(openssl rand -base64 32 | tr -d '/+=\n' | head -c 28) + + # Add required character types + local lower=$(echo "abcdefghijklmnopqrstuvwxyz" | fold -w1 | shuf | head -1) + local upper=$(echo "ABCDEFGHIJKLMNOPQRSTUVWXYZ" | fold -w1 | shuf | head -1) + local digit=$(echo "0123456789" | fold -w1 | shuf | head -1) + local special=$(echo '!#$^&*()-_=+' | fold -w1 | shuf | head -1) + + password="${password}${lower}${upper}${digit}${special}" + + # Shuffle the password + password=$(echo "$password" | fold -w1 | shuf | tr -d '\n') + + # Verify requirements + if [[ ${#password} -ge 32 ]] && \ + [[ "$password" =~ [a-z] ]] && \ + [[ "$password" =~ [A-Z] ]] && \ + [[ "$password" =~ [0-9] ]] && \ + [[ "$password" =~ [\!\#\$\^\&\*\(\)\-\_\=\+] ]] && \ + [[ ! "$password" =~ [%@:\/\;\[\]\{\}\|\<\>\,\.\?] 
]]; then + echo "$password" + return 0 + fi + + ((attempts++)) + done + + echo -e "${RED}[ERROR]${NC} Failed to generate valid password after $max_attempts attempts" + return 1 +} + +# Generate MEK (Master Encryption Key) for OSMO +generate_mek() { + # MEK is a JWK (JSON Web Key) format + # OSMO expects: {"currentMek": "key1", "meks": {"key1": ""}} + + # Generate a 256-bit key + local key_bytes=$(openssl rand 32) + local key_base64=$(echo -n "$key_bytes" | base64 | tr -d '\n') + + # Create JWK structure (symmetric key) + local jwk=$(cat <<EOF +{"currentMek": "key1", "meks": {"key1": "$key_base64"}} +EOF +) + + echo "$jwk" +} + +# Check if a secret with the given name exists; prints its ID when found +# NOTE(review): this span was reconstructed from surrounding callers — confirm exact CLI subcommand +secret_exists() { + local parent_id=$1 + local secret_name=$2 + local nebius_path=$(get_nebius_path) + + local result=$("$nebius_path" mysterybox v1 secret get-by-name --parent-id "$parent_id" --name "$secret_name" --format json 2>/dev/null) + + # Extract JSON from output (CLI may print info messages before JSON) + local json_result=$(echo "$result" | awk '/^{/,0') + + if [[ -n "$json_result" && "$json_result" != "null" ]]; then + echo "$json_result" | jq -r '.metadata.id' + return 0 + fi + return 1 +} + +# Create secret in MysteryBox +create_secret() { + local parent_id=$1 + local secret_name=$2 + local key=$3 + local value=$4 + local nebius_path=$(get_nebius_path) + + # Escape special characters in value for JSON + local escaped_value=$(echo -n "$value" | jq -Rs '.') + # Remove surrounding quotes added by jq + escaped_value=${escaped_value:1:-1} + + local payload="[{\"key\":\"$key\",\"string_value\":\"$escaped_value\"}]" + + local result=$("$nebius_path" mysterybox v1 secret create \ + --parent-id "$parent_id" \ + --name "$secret_name" \ + --secret-version-payload "$payload" \ + --format json 2>&1) + + local exit_code=$?
+ + # Extract JSON from output (CLI may print info messages before JSON) + # Find the first line starting with '{' and print everything from there + local json_result=$(echo "$result" | awk '/^{/,0') + + if [[ $exit_code -eq 0 && -n "$json_result" ]]; then + echo "$json_result" | jq -r '.metadata.id' + return 0 + else + echo -e "${RED}[ERROR]${NC} Failed to create secret: $result" + return 1 + fi +} + +# Delete secret from MysteryBox +delete_secret() { + local secret_id=$1 + local nebius_path=$(get_nebius_path) + + "$nebius_path" mysterybox v1 secret delete --id "$secret_id" 2>/dev/null +} + +# ----------------------------------------------------------------------------- +# Main Secret Creation Functions +# ----------------------------------------------------------------------------- + +create_postgresql_secret() { + echo -e "${BLUE}Creating PostgreSQL password secret...${NC}" + + # Check if secret already exists + local existing_id=$(secret_exists "$NEBIUS_PROJECT_ID" "$POSTGRESQL_SECRET_NAME") + + if [[ -n "$existing_id" ]]; then + echo -e "${YELLOW}[!]${NC} Secret '$POSTGRESQL_SECRET_NAME' already exists (ID: $existing_id)" + read -p " Replace existing secret? (y/N): " replace + if [[ "$replace" =~ ^[Yy]$ ]]; then + echo " Deleting existing secret..." + delete_secret "$existing_id" + sleep 2 + else + echo " Using existing secret" + export OSMO_POSTGRESQL_SECRET_ID="$existing_id" + export TF_VAR_postgresql_mysterybox_secret_id="$existing_id" + return 0 + fi + fi + + # Generate password + echo " Generating secure password..." + local password=$(generate_postgresql_password) + if [[ $? -ne 0 || -z "$password" ]]; then + echo -e "${RED}[ERROR]${NC} Failed to generate password" + return 1 + fi + echo -e "${GREEN}[✓]${NC} Password generated (length: ${#password})" + + # Store in MysteryBox + echo " Storing in MysteryBox..." + local secret_id=$(create_secret "$NEBIUS_PROJECT_ID" "$POSTGRESQL_SECRET_NAME" "password" "$password") + + if [[ $? 
-eq 0 && -n "$secret_id" ]]; then + echo -e "${GREEN}[✓]${NC} PostgreSQL secret created: $secret_id" + export OSMO_POSTGRESQL_SECRET_ID="$secret_id" + export TF_VAR_postgresql_mysterybox_secret_id="$secret_id" + return 0 + else + echo -e "${RED}[ERROR]${NC} Failed to create PostgreSQL secret" + return 1 + fi +} + +create_mek_secret() { + echo -e "${BLUE}Creating MEK (Master Encryption Key) secret...${NC}" + + # Check if secret already exists + local existing_id=$(secret_exists "$NEBIUS_PROJECT_ID" "$MEK_SECRET_NAME") + + if [[ -n "$existing_id" ]]; then + echo -e "${YELLOW}[!]${NC} Secret '$MEK_SECRET_NAME' already exists (ID: $existing_id)" + read -p " Replace existing secret? (y/N): " replace + if [[ "$replace" =~ ^[Yy]$ ]]; then + echo " Deleting existing secret..." + delete_secret "$existing_id" + sleep 2 + else + echo " Using existing secret" + export OSMO_MEK_SECRET_ID="$existing_id" + export TF_VAR_mek_mysterybox_secret_id="$existing_id" + return 0 + fi + fi + + # Generate MEK + echo " Generating Master Encryption Key..." + local mek=$(generate_mek) + if [[ $? -ne 0 || -z "$mek" ]]; then + echo -e "${RED}[ERROR]${NC} Failed to generate MEK" + return 1 + fi + echo -e "${GREEN}[✓]${NC} MEK generated" + + # Store in MysteryBox + echo " Storing in MysteryBox..." + local secret_id=$(create_secret "$NEBIUS_PROJECT_ID" "$MEK_SECRET_NAME" "mek" "$mek") + + if [[ $? -eq 0 && -n "$secret_id" ]]; then + echo -e "${GREEN}[✓]${NC} MEK secret created: $secret_id" + export OSMO_MEK_SECRET_ID="$secret_id" + export TF_VAR_mek_mysterybox_secret_id="$secret_id" + return 0 + else + echo -e "${RED}[ERROR]${NC} Failed to create MEK secret" + return 1 + fi +} + + +# ----------------------------------------------------------------------------- +# Main +# ----------------------------------------------------------------------------- + +main() { + # Check prerequisites + if ! 
check_prerequisites; then + return 1 + fi + + echo -e "${BLUE}Step 2: Creating secrets in MysteryBox${NC}" + echo "" + + local success=true + + # Create PostgreSQL secret + if ! create_postgresql_secret; then + success=false + fi + echo "" + + # Create MEK secret + if ! create_mek_secret; then + success=false + fi + echo "" + + if ! $success; then + echo -e "${RED}[ERROR]${NC} Some secrets failed to create" + return 1 + fi + + # Summary + echo "========================================" + echo -e "${GREEN}Secrets initialization complete!${NC}" + echo "========================================" + echo "" + echo "Environment variables exported:" + echo " TF_VAR_postgresql_mysterybox_secret_id = $TF_VAR_postgresql_mysterybox_secret_id" + echo " TF_VAR_mek_mysterybox_secret_id = $TF_VAR_mek_mysterybox_secret_id" + echo "" + echo "Secrets are stored in MysteryBox. Run this script again in a new" + echo "terminal session to retrieve existing secrets by name." + echo "" + echo "To retrieve secret values manually:" + echo " # PostgreSQL password:" + echo " nebius mysterybox v1 payload get-by-key --secret-id $TF_VAR_postgresql_mysterybox_secret_id --key password --format json | jq -r '.data.string_value'" + echo "" + echo " # MEK:" + echo " nebius mysterybox v1 payload get-by-key --secret-id $TF_VAR_mek_mysterybox_secret_id --key mek --format json | jq -r '.data.string_value'" + echo "" + echo "Next steps:" + echo " 1. cd ../001-iac" + echo " 2. cp terraform.tfvars.cost-optimized.example terraform.tfvars # or another preset" + echo " 3. 
terraform init && terraform apply" + echo "" + + return 0 +} + +# Run main +main diff --git a/applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh b/applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh new file mode 100755 index 000000000..25f0bbe48 --- /dev/null +++ b/applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh @@ -0,0 +1,216 @@ +#!/bin/bash +# +# WireGuard Client Setup Script +# +# This script helps configure a WireGuard client to connect to the +# OSMO cluster's private network. +# + +set -e + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo "" +echo "========================================" +echo " WireGuard Client Setup" +echo "========================================" +echo "" + +# Detect OS +detect_os() { + if [[ "$OSTYPE" == "linux-gnu"* ]]; then + if grep -q Microsoft /proc/version 2>/dev/null; then + echo "wsl" + else + echo "linux" + fi + elif [[ "$OSTYPE" == "darwin"* ]]; then + echo "macos" + elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then + echo "windows" + else + echo "unknown" + fi +} + +OS=$(detect_os) +echo "Detected OS: $OS" +echo "" + +# Check if WireGuard is installed +check_wireguard() { + case $OS in + linux|wsl) + if command -v wg &>/dev/null; then + return 0 + fi + ;; + macos) + if [[ -d "/Applications/WireGuard.app" ]] || command -v wg &>/dev/null; then + return 0 + fi + ;; + windows) + if [[ -d "/c/Program Files/WireGuard" ]]; then + return 0 + fi + ;; + esac + return 1 +} + +# Install WireGuard +install_wireguard() { + echo -e "${BLUE}Installing WireGuard...${NC}" + case $OS in + linux) + sudo apt-get update + sudo apt-get install -y wireguard wireguard-tools + ;; + wsl) + echo -e "${YELLOW}For WSL, install WireGuard on Windows:${NC}" + echo " 1. Download from: https://www.wireguard.com/install/" + echo " 2. Install the Windows application" + echo " 3. 
Import the configuration file" + return 1 + ;; + macos) + echo "Install WireGuard from the App Store or:" + echo " brew install wireguard-tools" + return 1 + ;; + *) + echo "Please install WireGuard manually from: https://www.wireguard.com/install/" + return 1 + ;; + esac +} + +# Get Terraform outputs +get_terraform_output() { + local output_name=$1 + cd ../001-iac 2>/dev/null || { + echo -e "${RED}[ERROR]${NC} Cannot find ../001-iac directory" + return 1 + } + terraform output -raw "$output_name" 2>/dev/null + cd - >/dev/null +} + +# Generate client keys +generate_client_keys() { + local private_key=$(wg genkey) + local public_key=$(echo "$private_key" | wg pubkey) + echo "$private_key|$public_key" +} + +# Main setup +main() { + # Check/Install WireGuard + if ! check_wireguard; then + install_wireguard || { + echo "" + echo -e "${RED}[ERROR]${NC} WireGuard not installed. Please install manually." + exit 1 + } + fi + echo -e "${GREEN}[✓]${NC} WireGuard installed" + echo "" + + # Check if WireGuard was enabled in Terraform + echo -e "${BLUE}Retrieving WireGuard server information...${NC}" + + local wg_public_ip=$(get_terraform_output "wireguard.public_ip" 2>/dev/null || echo "") + + if [[ -z "$wg_public_ip" || "$wg_public_ip" == "null" ]]; then + echo -e "${RED}[ERROR]${NC} WireGuard VPN was not deployed." 
+ echo "" + echo "To enable WireGuard, set in terraform.tfvars:" + echo " enable_wireguard = true" + echo "" + echo "Then run: terraform apply" + exit 1 + fi + + local wg_ui_url=$(get_terraform_output "wireguard.ui_url" 2>/dev/null || echo "") + + echo "" + echo -e "${GREEN}[✓]${NC} WireGuard server found" + echo " Public IP: $wg_public_ip" + if [[ -n "$wg_ui_url" && "$wg_ui_url" != "null" ]]; then + echo " Web UI: $wg_ui_url" + fi + echo "" + + # Instructions for manual configuration + echo "========================================" + echo -e "${BLUE}Configuration Instructions${NC}" + echo "========================================" + echo "" + echo "Option 1: Use WireGuard Web UI (Recommended)" + echo " 1. Open in browser: $wg_ui_url" + echo " 2. Login with the generated password (check Terraform output)" + echo " 3. Create a new client configuration" + echo " 4. Download the configuration file" + echo " 5. Import into WireGuard client" + echo "" + echo "Option 2: Manual Configuration" + echo " 1. Generate client keys: wg genkey | tee privatekey | wg pubkey > publickey" + echo " 2. SSH to WireGuard server and add peer" + echo " 3. 
Create local configuration file" + echo "" + + # Create config template + local config_file="wg-client-osmo.conf" + + if [[ "$OS" == "linux" ]] && command -v wg &>/dev/null; then + echo -e "${BLUE}Generating client configuration template...${NC}" + + local keys=$(generate_client_keys) + local client_private_key=$(echo "$keys" | cut -d'|' -f1) + local client_public_key=$(echo "$keys" | cut -d'|' -f2) + + cat > "$config_file" << EOF +[Interface] +# Client private key (generated) +PrivateKey = $client_private_key +# Client IP address in VPN network (change if needed) +Address = 10.8.0.2/24 +DNS = 8.8.8.8 + +[Peer] +# WireGuard server public key (get from server) +PublicKey = +# Allowed IPs - route all traffic through VPN +AllowedIPs = 10.8.0.0/24, 10.0.0.0/16 +# WireGuard server endpoint +Endpoint = $wg_public_ip:51820 +# Keep connection alive +PersistentKeepalive = 25 +EOF + + echo "" + echo -e "${GREEN}[✓]${NC} Configuration template created: $config_file" + echo "" + echo "Your client public key (add this to server):" + echo " $client_public_key" + echo "" + echo "Next steps:" + echo " 1. Get server public key from WireGuard Web UI or server" + echo " 2. Add your client public key to server" + echo " 3. Update in $config_file" + echo " 4. Start VPN: sudo wg-quick up ./$config_file" + fi + + echo "" + echo "========================================" + echo -e "${GREEN}Setup guide complete!${NC}" + echo "========================================" +} + +main "$@" diff --git a/applications/osmo/deploy/001-iac/README.md b/applications/osmo/deploy/001-iac/README.md new file mode 100755 index 000000000..6f2ae927d --- /dev/null +++ b/applications/osmo/deploy/001-iac/README.md @@ -0,0 +1,245 @@ +# Infrastructure as Code (Terraform) + +This directory contains Terraform configurations for deploying OSMO infrastructure on Nebius. + +## Prerequisites + +1. Install required tools: + ```bash + cd ../000-prerequisites + ./install-tools.sh + ``` + +2. 
Configure Nebius environment: + ```bash + source ../000-prerequisites/nebius-env-init.sh + ``` + +3. **(Recommended)** Initialize secrets in MysteryBox: + ```bash + source ../000-prerequisites/secrets-init.sh + ``` + This generates secure passwords/keys and stores them in MysteryBox, keeping them OUT of Terraform state. + +## Quick Start + +```bash +# Recommended: Cost-optimized with secure private access +cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars + +# Edit if needed (tenant_id and parent_id set via environment) +vim terraform.tfvars + +# Deploy +terraform init +terraform plan +terraform apply +``` + +## Configuration Tiers + +| File | Use Case | GPU | Security | Est. Cost/6h | +|------|----------|-----|----------|--------------| +| `terraform.tfvars.cost-optimized-secure.example` (recommended) | Dev | 1x L40S | WireGuard | **~$15-25** | +| `terraform.tfvars.cost-optimized.example` | Dev (cheapest) | 1x L40S | Public | ~$10-15 | +| `terraform.tfvars.secure.example` | Staging | 8x H100 | WireGuard | ~$300-400 | +| `terraform.tfvars.production.example` | Production | 32x H200 | WireGuard | ~$1000+ | + +## Resources Created + +### Network +- VPC Network +- Subnet with configurable CIDR + +### Kubernetes +- Managed Kubernetes Cluster (MK8s) +- CPU Node Group (for system workloads) +- GPU Node Group(s) (for training) +- Service Account for node groups + +### Storage +- Object Storage Bucket (S3-compatible) +- Shared Filesystem (Filestore) +- Service Account with access keys + +### Database +- Managed PostgreSQL Cluster + +### Container Registry +- Nebius Container Registry (when `enable_container_registry = true`) + +### Optional +- WireGuard VPN Server (when `enable_wireguard = true`) +- GPU Cluster for InfiniBand (when `enable_gpu_cluster = true`) + +## Module Structure + +``` +001-iac/ +├── main.tf # Root module +├── variables.tf # Input variables +├── outputs.tf # Output values +├── locals.tf # Local values +├── versions.tf # Provider 
versions +├── terraform.tfvars.*.example +└── modules/ + ├── platform/ # VPC, Storage, DB, Container Registry + ├── k8s/ # Kubernetes cluster + └── wireguard/ # VPN server +``` + +## GPU Options + +### Available Platforms (eu-north1) + +| Platform | GPU | VRAM | ~Cost/hr | Best For | +|----------|-----|------|----------|----------| +| `gpu-l40s-a` | L40S Intel | 48GB | **$1.55** | Dev/Testing (cheapest) | +| `gpu-l40s-d` | L40S AMD | 48GB | **$1.55** | Dev/Testing | +| `gpu-h100-sxm` | H100 | 80GB | ~$4-5 | Training | +| `gpu-h200-sxm` | H200 | 141GB | ~$5-6 | Large models | + +### Presets + +| Platform | Preset | GPUs | vCPUs | RAM | +|----------|--------|------|-------|-----| +| L40S | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | +| L40S | `2gpu-16vcpu-64gb` | 2 | 16 | 64GB | +| H100/H200 | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | +| H100/H200 | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | + +## Security Options + +### Public Access (Default) + +```hcl +enable_public_endpoint = true +cpu_nodes_assign_public_ip = true +enable_wireguard = false +``` + +### Private Access (WireGuard) + +```hcl +enable_public_endpoint = false +cpu_nodes_assign_public_ip = false +gpu_nodes_assign_public_ip = false +enable_wireguard = true +``` + +After deployment, set up VPN client: +```bash +cd ../000-prerequisites +./wireguard-client-setup.sh +``` + +## Cost Optimization + +### Use Preemptible GPUs +```hcl +gpu_nodes_preemptible = true # Up to 70% savings +``` + +### Use Single-GPU Nodes for Dev +```hcl +gpu_nodes_preset = "1gpu-16vcpu-200gb" +enable_gpu_cluster = false +``` + +### Minimize Storage +```hcl +filestore_size_gib = 256 +postgresql_disk_size_gib = 20 +``` + +## Secrets Management (MysteryBox) + +This module supports two approaches for secrets: + +### Option A: MysteryBox (Recommended) +Secrets are stored in Nebius MysteryBox and read at runtime. 
**Not stored in Terraform state.** + +```bash +# Before terraform apply: +cd ../000-prerequisites +source ./secrets-init.sh # Creates secrets in MysteryBox +cd ../001-iac +terraform apply # Uses TF_VAR_* env vars set by script +``` + +**Benefits:** +- Secrets never in Terraform state file +- Centralized secret management +- Easier rotation without re-deploying +- Better audit trail + +**Retrieving Secrets:** +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_POSTGRESQL_SECRET_ID \ + --key password \ + --format json | jq -r '.data.string_value' + +# MEK +nebius mysterybox v1 payload get-by-key \ + --secret-id $OSMO_MEK_SECRET_ID \ + --key mek \ + --format json | jq -r '.data.string_value' +``` + +### Option B: Terraform-Generated (Fallback) +If MysteryBox secret IDs are not set, Terraform generates secrets automatically. + +```hcl +# Secrets stored in Terraform state (less secure) +postgresql_mysterybox_secret_id = null # Default +mek_mysterybox_secret_id = null # Default +``` + +**Retrieving Secrets:** +```bash +terraform output -json postgresql_password +``` + +### MysteryBox Variables + +| Variable | Description | +|----------|-------------| +| `postgresql_mysterybox_secret_id` | Secret ID for PostgreSQL password | +| `mek_mysterybox_secret_id` | Secret ID for OSMO MEK | + +## Outputs + +After `terraform apply`, you'll see: + +- `cluster_id` - Kubernetes cluster ID +- `cluster_endpoint` - Kubernetes API endpoint +- `storage_bucket` - Object storage details +- `container_registry` - Container Registry details (endpoint, name) +- `postgresql` - Database connection info +- `wireguard` - VPN details (if enabled) +- `next_steps` - Instructions for next deployment phase + +## Cleanup + +```bash +terraform destroy +``` + +**Warning**: This will delete all resources including data in PostgreSQL and Object Storage. 
+ +## Troubleshooting + +### Authentication Error +```bash +source ../000-prerequisites/nebius-env-init.sh +``` + +### Resource Quota Exceeded +Check your Nebius quota in the console and request increases if needed. + +### Invalid GPU Platform +Verify the platform is available in your region: +- `eu-north1`: H100 +- `eu-west1`: H200 diff --git a/applications/osmo/deploy/001-iac/locals.tf b/applications/osmo/deploy/001-iac/locals.tf new file mode 100755 index 000000000..579adee1c --- /dev/null +++ b/applications/osmo/deploy/001-iac/locals.tf @@ -0,0 +1,77 @@ +# ============================================================================= +# Local Values +# ============================================================================= + +locals { + # Resource naming + name_prefix = "${var.project_name}-${var.environment}" + + # SSH key handling + ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( + fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null + ) + + # Region-specific defaults + region_defaults = { + eu-north1 = { + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-3" + } + eu-west1 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-5" + } + eu-north2 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "eu-north2-a" + } + us-central1 = { + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "us-central1-a" + } + } + + # Available GPU platforms by region (for reference) + # eu-north1: + # - gpu-h200-sxm (H200, 141GB VRAM) - high-end + # - gpu-h100-sxm (H100, 80GB VRAM) - high-end + # - gpu-l40s-a (L40S Intel, 48GB VRAM) - cost-effective + # - gpu-l40s-d (L40S AMD, 48GB VRAM) - cost-effective + # + # L40S presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb (verify in console) + # H100/H200 presets: 
1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + + # Current region config with overrides + current_region = local.region_defaults[var.region] + + gpu_nodes_platform = coalesce(var.gpu_nodes_platform, local.current_region.gpu_nodes_platform) + gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region.gpu_nodes_preset) + infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region.infiniband_fabric) + + # Generate unique storage bucket name if not provided + storage_bucket_name = var.storage_bucket_name != "" ? var.storage_bucket_name : "${local.name_prefix}-storage-${random_string.suffix.result}" + + # Common tags/labels + common_labels = { + project = var.project_name + environment = var.environment + managed_by = "terraform" + } +} + +# Random suffix for unique naming +resource "random_string" "suffix" { + length = 8 + lower = true + upper = false + numeric = true + special = false + + keepers = { + project_id = var.parent_id + } +} diff --git a/applications/osmo/deploy/001-iac/main.tf b/applications/osmo/deploy/001-iac/main.tf new file mode 100755 index 000000000..af91f9e0c --- /dev/null +++ b/applications/osmo/deploy/001-iac/main.tf @@ -0,0 +1,134 @@ +# ============================================================================= +# OSMO on Nebius - Root Module +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Platform Module (VPC, Storage, PostgreSQL, Container Registry) +# ----------------------------------------------------------------------------- +module "platform" { + source = "./modules/platform" + + parent_id = var.parent_id + tenant_id = var.tenant_id + region = var.region + name_prefix = local.name_prefix + + # Network + vpc_cidr = var.vpc_cidr + + # Storage + storage_bucket_name = local.storage_bucket_name + + # Filestore + enable_filestore = var.enable_filestore + filestore_disk_type = var.filestore_disk_type + 
filestore_size_gib = var.filestore_size_gib + filestore_block_size_kib = var.filestore_block_size_kib + + # PostgreSQL (optional - can use in-cluster PostgreSQL instead) + enable_managed_postgresql = var.enable_managed_postgresql + postgresql_version = var.postgresql_version + postgresql_public_access = var.postgresql_public_access + postgresql_platform = var.postgresql_platform + postgresql_preset = var.postgresql_preset + postgresql_disk_type = var.postgresql_disk_type + postgresql_disk_size_gib = var.postgresql_disk_size_gib + postgresql_host_count = var.postgresql_host_count + postgresql_database_name = var.postgresql_database_name + postgresql_username = var.postgresql_username + + # Container Registry + enable_container_registry = var.enable_container_registry + container_registry_name = var.container_registry_name + + # MysteryBox secrets (optional - more secure, keeps secrets out of TF state) + postgresql_mysterybox_secret_id = var.postgresql_mysterybox_secret_id + mek_mysterybox_secret_id = var.mek_mysterybox_secret_id +} + +# ----------------------------------------------------------------------------- +# Kubernetes Module +# ----------------------------------------------------------------------------- +module "k8s" { + source = "./modules/k8s" + + parent_id = var.parent_id + tenant_id = var.tenant_id + region = var.region + name_prefix = local.name_prefix + + # Network + subnet_id = module.platform.subnet_id + + # Cluster config + k8s_version = var.k8s_version + etcd_cluster_size = var.etcd_cluster_size + enable_public_endpoint = var.enable_public_endpoint + + # SSH + ssh_user_name = var.ssh_user_name + ssh_public_key = local.ssh_public_key + + # CPU nodes + cpu_nodes_count = var.cpu_nodes_count + cpu_nodes_platform = var.cpu_nodes_platform + cpu_nodes_preset = var.cpu_nodes_preset + cpu_disk_type = var.cpu_disk_type + cpu_disk_size_gib = var.cpu_disk_size_gib + cpu_nodes_assign_public_ip = var.cpu_nodes_assign_public_ip + + # GPU nodes + 
gpu_nodes_count_per_group = var.gpu_nodes_count_per_group + gpu_node_groups = var.gpu_node_groups + gpu_nodes_platform = local.gpu_nodes_platform + gpu_nodes_preset = local.gpu_nodes_preset + gpu_disk_type = var.gpu_disk_type + gpu_disk_size_gib = var.gpu_disk_size_gib + gpu_nodes_assign_public_ip = var.gpu_nodes_assign_public_ip + enable_gpu_cluster = var.enable_gpu_cluster + infiniband_fabric = local.infiniband_fabric + enable_gpu_taints = var.enable_gpu_taints + gpu_nodes_preemptible = var.gpu_nodes_preemptible + + # Filestore + enable_filestore = var.enable_filestore + filestore_id = var.enable_filestore ? module.platform.filestore_id : null + + # Note: No explicit depends_on needed - Terraform infers dependencies from: + # - subnet_id (waits for subnet) + # - filestore_id (waits for filestore if enabled) + # This allows k8s to start as soon as subnet/filestore are ready, + # without waiting for PostgreSQL (which takes 5-15 min) +} + +# ----------------------------------------------------------------------------- +# WireGuard VPN Module (Optional) +# ----------------------------------------------------------------------------- +module "wireguard" { + count = var.enable_wireguard ? 
1 : 0 + source = "./modules/wireguard" + + parent_id = var.parent_id + region = var.region + name_prefix = local.name_prefix + + # Network + subnet_id = module.platform.subnet_id + vpc_cidr = var.vpc_cidr + wg_network = var.wireguard_network + + # Instance config + platform = var.wireguard_platform + preset = var.wireguard_preset + disk_size_gib = var.wireguard_disk_size_gib + + # WireGuard config + wg_port = var.wireguard_port + ui_port = var.wireguard_ui_port + + # SSH + ssh_user_name = var.ssh_user_name + ssh_public_key = local.ssh_public_key + + # Note: No explicit depends_on needed - Terraform infers from subnet_id +} diff --git a/applications/osmo/deploy/001-iac/modules/k8s/main.tf b/applications/osmo/deploy/001-iac/modules/k8s/main.tf new file mode 100755 index 000000000..b5d6bc07b --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/k8s/main.tf @@ -0,0 +1,176 @@ +# ============================================================================= +# Kubernetes Module +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Service Account for Node Groups +# ----------------------------------------------------------------------------- +data "nebius_iam_v1_group" "editors" { + name = "editors" + parent_id = var.tenant_id +} + +resource "nebius_iam_v1_service_account" "k8s_nodes" { + parent_id = var.parent_id + name = "${var.name_prefix}-k8s-nodes-sa" +} + +resource "nebius_iam_v1_group_membership" "k8s_nodes" { + parent_id = data.nebius_iam_v1_group.editors.id + member_id = nebius_iam_v1_service_account.k8s_nodes.id +} + +# ----------------------------------------------------------------------------- +# GPU Cluster (InfiniBand) +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_gpu_cluster" "main" { + count = var.enable_gpu_cluster && var.gpu_nodes_count_per_group > 0 ? 
1 : 0 + + parent_id = var.parent_id + name = "${var.name_prefix}-gpu-cluster" + infiniband_fabric = var.infiniband_fabric +} + +# ----------------------------------------------------------------------------- +# Managed Kubernetes Cluster +# ----------------------------------------------------------------------------- +resource "nebius_mk8s_v1_cluster" "main" { + parent_id = var.parent_id + name = "${var.name_prefix}-cluster" + + control_plane = { + subnet_id = var.subnet_id + version = var.k8s_version + etcd_cluster_size = var.etcd_cluster_size + + endpoints = var.enable_public_endpoint ? { + public_endpoint = {} + } : {} + } + + lifecycle { + ignore_changes = [labels] + } +} + +# ----------------------------------------------------------------------------- +# CPU Node Group +# ----------------------------------------------------------------------------- +resource "nebius_mk8s_v1_node_group" "cpu" { + parent_id = nebius_mk8s_v1_cluster.main.id + name = "${var.name_prefix}-cpu-nodes" + fixed_node_count = var.cpu_nodes_count + version = var.k8s_version + + labels = { + "node-type" = "cpu" + } + + template = { + boot_disk = { + size_gibibytes = var.cpu_disk_size_gib + type = var.cpu_disk_type + } + + service_account_id = nebius_iam_v1_service_account.k8s_nodes.id + + network_interfaces = [ + { + subnet_id = var.subnet_id + public_ip_address = var.cpu_nodes_assign_public_ip ? {} : null + } + ] + + resources = { + platform = var.cpu_nodes_platform + preset = var.cpu_nodes_preset + } + + filesystems = var.enable_filestore && var.filestore_id != null ? 
[ + { + attach_mode = "READ_WRITE" + mount_tag = "data" + existing_filesystem = { + id = var.filestore_id + } + } + ] : null + + cloud_init_user_data = templatefile("${path.module}/templates/cloud-init.yaml", { + ssh_user_name = var.ssh_user_name + ssh_public_key = var.ssh_public_key + enable_filestore = var.enable_filestore + }) + } +} + +# ----------------------------------------------------------------------------- +# GPU Node Groups +# ----------------------------------------------------------------------------- +resource "nebius_mk8s_v1_node_group" "gpu" { + count = var.gpu_nodes_count_per_group > 0 ? var.gpu_node_groups : 0 + + parent_id = nebius_mk8s_v1_cluster.main.id + name = "${var.name_prefix}-gpu-nodes-${count.index}" + fixed_node_count = var.gpu_nodes_count_per_group + version = var.k8s_version + + labels = { + "node-type" = "gpu" + } + + template = { + boot_disk = { + size_gibibytes = var.gpu_disk_size_gib + type = var.gpu_disk_type + } + + service_account_id = nebius_iam_v1_service_account.k8s_nodes.id + + network_interfaces = [ + { + subnet_id = var.subnet_id + public_ip_address = var.gpu_nodes_assign_public_ip ? {} : null + } + ] + + resources = { + platform = var.gpu_nodes_platform + preset = var.gpu_nodes_preset + } + + # GPU cluster for InfiniBand + gpu_cluster = var.enable_gpu_cluster ? nebius_compute_v1_gpu_cluster.main[0] : null + + # Preemptible configuration + preemptible = var.gpu_nodes_preemptible ? { + on_preemption = "STOP" + priority = 3 + } : null + + # Taints for GPU nodes + taints = var.enable_gpu_taints ? [ + { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + ] : null + + filesystems = var.enable_filestore && var.filestore_id != null ? 
[ + { + attach_mode = "READ_WRITE" + mount_tag = "data" + existing_filesystem = { + id = var.filestore_id + } + } + ] : null + + cloud_init_user_data = templatefile("${path.module}/templates/cloud-init.yaml", { + ssh_user_name = var.ssh_user_name + ssh_public_key = var.ssh_public_key + enable_filestore = var.enable_filestore + }) + } +} diff --git a/applications/osmo/deploy/001-iac/modules/k8s/outputs.tf b/applications/osmo/deploy/001-iac/modules/k8s/outputs.tf new file mode 100755 index 000000000..bdae92cf3 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/k8s/outputs.tf @@ -0,0 +1,38 @@ +# ============================================================================= +# Kubernetes Module Outputs +# ============================================================================= + +output "cluster_id" { + description = "Kubernetes cluster ID" + value = nebius_mk8s_v1_cluster.main.id +} + +output "cluster_name" { + description = "Kubernetes cluster name" + value = nebius_mk8s_v1_cluster.main.name +} + +output "cluster_endpoint" { + description = "Kubernetes API endpoint" + value = var.enable_public_endpoint ? ( + nebius_mk8s_v1_cluster.main.status.control_plane.endpoints.public_endpoint + ) : ( + try(nebius_mk8s_v1_cluster.main.status.control_plane.endpoints.private_endpoint, "") + ) +} + +output "cluster_ca_certificate" { + description = "Kubernetes cluster CA certificate" + value = nebius_mk8s_v1_cluster.main.status.control_plane.auth.cluster_ca_certificate + sensitive = true +} + +output "service_account_id" { + description = "Service account ID for node groups" + value = nebius_iam_v1_service_account.k8s_nodes.id +} + +output "gpu_cluster_id" { + description = "GPU cluster ID" + value = var.enable_gpu_cluster && var.gpu_nodes_count_per_group > 0 ? 
nebius_compute_v1_gpu_cluster.main[0].id : null +} diff --git a/applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml b/applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml new file mode 100755 index 000000000..096cf18b0 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml @@ -0,0 +1,26 @@ +#cloud-config +%{ if ssh_public_key != null && ssh_public_key != "" ~} +users: + - name: ${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} +%{ endif ~} + +package_update: true +packages: + - nfs-common + - curl + - jq + +%{ if enable_filestore ~} +runcmd: + # Mount filestore if attached + - | + if [ -b /dev/disk/by-id/virtio-data ]; then + mkdir -p /mnt/data + mount -t virtiofs data /mnt/data || true + echo "data /mnt/data virtiofs defaults 0 0" >> /etc/fstab + fi +%{ endif ~} diff --git a/applications/osmo/deploy/001-iac/modules/k8s/variables.tf b/applications/osmo/deploy/001-iac/modules/k8s/variables.tf new file mode 100755 index 000000000..e01b7eaac --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/k8s/variables.tf @@ -0,0 +1,192 @@ +# ============================================================================= +# Kubernetes Module Variables +# ============================================================================= + +variable "parent_id" { + description = "Nebius project ID" + type = string +} + +variable "tenant_id" { + description = "Nebius tenant ID" + type = string +} + +variable "region" { + description = "Nebius region" + type = string +} + +variable "name_prefix" { + description = "Prefix for resource names" + type = string +} + +# ----------------------------------------------------------------------------- +# Network Configuration +# ----------------------------------------------------------------------------- + +variable "subnet_id" { + description = "Subnet ID for the cluster" + type = string +} + +# 
----------------------------------------------------------------------------- +# Cluster Configuration +# ----------------------------------------------------------------------------- + +variable "k8s_version" { + description = "Kubernetes version" + type = string + default = null +} + +variable "etcd_cluster_size" { + description = "Size of etcd cluster" + type = number + default = 3 +} + +variable "enable_public_endpoint" { + description = "Enable public endpoint for Kubernetes API" + type = bool + default = true +} + +# ----------------------------------------------------------------------------- +# SSH Access +# ----------------------------------------------------------------------------- + +variable "ssh_user_name" { + description = "SSH username for node access" + type = string + default = "ubuntu" +} + +variable "ssh_public_key" { + description = "SSH public key for node access" + type = string +} + +# ----------------------------------------------------------------------------- +# CPU Node Group Configuration +# ----------------------------------------------------------------------------- + +variable "cpu_nodes_count" { + description = "Number of CPU nodes" + type = number + default = 3 +} + +variable "cpu_nodes_platform" { + description = "Platform for CPU nodes" + type = string + default = "cpu-d3" +} + +variable "cpu_nodes_preset" { + description = "Resource preset for CPU nodes" + type = string + default = "16vcpu-64gb" +} + +variable "cpu_disk_type" { + description = "Disk type for CPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "cpu_disk_size_gib" { + description = "Disk size in GiB for CPU nodes" + type = number + default = 128 +} + +variable "cpu_nodes_assign_public_ip" { + description = "Assign public IPs to CPU nodes" + type = bool + default = true +} + +# ----------------------------------------------------------------------------- +# GPU Node Group Configuration +# 
----------------------------------------------------------------------------- + +variable "gpu_nodes_count_per_group" { + description = "Number of GPU nodes per group" + type = number + default = 1 +} + +variable "gpu_node_groups" { + description = "Number of GPU node groups" + type = number + default = 1 +} + +variable "gpu_nodes_platform" { + description = "Platform for GPU nodes" + type = string +} + +variable "gpu_nodes_preset" { + description = "Resource preset for GPU nodes" + type = string +} + +variable "gpu_disk_type" { + description = "Disk type for GPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "gpu_disk_size_gib" { + description = "Disk size in GiB for GPU nodes" + type = number + default = 1023 +} + +variable "gpu_nodes_assign_public_ip" { + description = "Assign public IPs to GPU nodes" + type = bool + default = false +} + +variable "enable_gpu_cluster" { + description = "Enable GPU cluster with InfiniBand" + type = bool + default = true +} + +variable "infiniband_fabric" { + description = "InfiniBand fabric name" + type = string +} + +variable "enable_gpu_taints" { + description = "Add NoSchedule taint to GPU nodes" + type = bool + default = true +} + +variable "gpu_nodes_preemptible" { + description = "Use preemptible GPU nodes" + type = bool + default = false +} + +# ----------------------------------------------------------------------------- +# Filestore Configuration +# ----------------------------------------------------------------------------- + +variable "enable_filestore" { + description = "Enable filestore attachment" + type = bool + default = true +} + +variable "filestore_id" { + description = "Filestore ID to attach" + type = string + default = null +} diff --git a/applications/osmo/deploy/001-iac/modules/k8s/versions.tf b/applications/osmo/deploy/001-iac/modules/k8s/versions.tf new file mode 100755 index 000000000..4505d171a --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/k8s/versions.tf @@ -0,0 +1,7 
@@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + } +} diff --git a/applications/osmo/deploy/001-iac/modules/platform/main.tf b/applications/osmo/deploy/001-iac/modules/platform/main.tf new file mode 100755 index 000000000..0b75c3af9 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/platform/main.tf @@ -0,0 +1,215 @@ +# ============================================================================= +# Platform Module - VPC, Storage, PostgreSQL, Container Registry +# ============================================================================= + +# ----------------------------------------------------------------------------- +# VPC Network +# ----------------------------------------------------------------------------- +resource "nebius_vpc_v1_network" "main" { + parent_id = var.parent_id + name = "${var.name_prefix}-network" +} + +resource "nebius_vpc_v1_subnet" "main" { + parent_id = var.parent_id + name = "${var.name_prefix}-subnet" + network_id = nebius_vpc_v1_network.main.id + + # Use network's default pools - more reliable across regions + ipv4_private_pools = { + use_network_pools = true + } +} + +# ----------------------------------------------------------------------------- +# Service Account for Storage +# ----------------------------------------------------------------------------- +resource "nebius_iam_v1_service_account" "storage" { + parent_id = var.parent_id + name = "${var.name_prefix}-storage-sa" +} + +# Get the "editors" group from tenant (grants storage.editor permissions) +# Reference: nebius-solutions-library/anyscale/deploy/bucket_key.tf +data "nebius_iam_v1_group" "editors" { + name = "editors" + parent_id = var.tenant_id +} + +# Add the storage service account to the editors group +# This grants the service account permissions to write to storage buckets +resource "nebius_iam_v1_group_membership" "storage_editor" { + parent_id = 
data.nebius_iam_v1_group.editors.id + member_id = nebius_iam_v1_service_account.storage.id +} + +resource "nebius_iam_v2_access_key" "storage" { + parent_id = var.parent_id + name = "${var.name_prefix}-storage-key" + description = "Access key for OSMO storage bucket" + + # Store secret in MysteryBox instead of returning directly + # Reference: nebius-solutions-library/modules/o11y/loki.tf + secret_delivery_mode = "MYSTERY_BOX" + + account = { + service_account = { + id = nebius_iam_v1_service_account.storage.id + } + } + + depends_on = [nebius_iam_v1_group_membership.storage_editor] +} + +# ----------------------------------------------------------------------------- +# MysteryBox - Read storage secret (ephemeral, NOT stored in state) +# Reference: nebius-solutions-library/modules/o11y/mysterybox.tf +# Requires Terraform >= 1.10.0 +# ----------------------------------------------------------------------------- +ephemeral "nebius_mysterybox_v1_secret_payload_entry" "storage_secret" { + secret_id = nebius_iam_v2_access_key.storage.status.secret_reference_id + key = "secret" +} + +# ----------------------------------------------------------------------------- +# Object Storage Bucket +# ----------------------------------------------------------------------------- +resource "nebius_storage_v1_bucket" "main" { + parent_id = var.parent_id + name = var.storage_bucket_name + versioning_policy = "ENABLED" +} + +# ----------------------------------------------------------------------------- +# Shared Filesystem (Filestore) +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_filesystem" "shared" { + count = var.enable_filestore ? 
1 : 0 + + parent_id = var.parent_id + name = "${var.name_prefix}-filestore" + type = var.filestore_disk_type + size_bytes = var.filestore_size_gib * 1024 * 1024 * 1024 + block_size_bytes = var.filestore_block_size_kib * 1024 + + lifecycle { + ignore_changes = [labels] + } +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Password (from MysteryBox - REQUIRED) +# ----------------------------------------------------------------------------- +# MysteryBox secret ID MUST be provided when using Managed PostgreSQL. +# This ensures passwords are NEVER stored in Terraform state. +# +# Setup: Run 'source ./secrets-init.sh' BEFORE 'terraform apply' +# +# Nebius PostgreSQL password requirements: +# - Min. 8 characters +# - At least one lowercase, uppercase, digit, special char EXCEPT % +# ----------------------------------------------------------------------------- + +# Validate that MysteryBox secret is provided when PostgreSQL is enabled +resource "terraform_data" "validate_postgresql_secret" { + count = var.enable_managed_postgresql ? 1 : 0 + + lifecycle { + precondition { + condition = var.postgresql_mysterybox_secret_id != null + error_message = <<-EOT + + ══════════════════════════════════════════════════════════════════════ + ERROR: PostgreSQL MysteryBox secret ID is required! + ══════════════════════════════════════════════════════════════════════ + + You must run secrets-init.sh BEFORE terraform apply: + + cd ../000-prerequisites + source ./secrets-init.sh + cd ../001-iac + terraform apply + + This creates the PostgreSQL password in MysteryBox and sets: + TF_VAR_postgresql_mysterybox_secret_id + + Without this, Terraform cannot securely configure PostgreSQL. 
+ ══════════════════════════════════════════════════════════════════════ + EOT + } + } +} + +# Read password from MysteryBox (ephemeral - NOT stored in state) +ephemeral "nebius_mysterybox_v1_secret_payload_entry" "postgresql_password" { + count = var.enable_managed_postgresql && var.postgresql_mysterybox_secret_id != null ? 1 : 0 + secret_id = var.postgresql_mysterybox_secret_id + key = "password" +} + +# Local to get the password from MysteryBox +locals { + postgresql_password = ( + !var.enable_managed_postgresql + ? null # PostgreSQL disabled + : var.postgresql_mysterybox_secret_id != null + ? ephemeral.nebius_mysterybox_v1_secret_payload_entry.postgresql_password[0].data.string_value + : null # Will fail validation above + ) +} + +# ----------------------------------------------------------------------------- +# Managed PostgreSQL (MSP) - Nebius Managed Service for PostgreSQL +# Enabled by default for production-ready database service +# ----------------------------------------------------------------------------- +resource "nebius_msp_postgresql_v1alpha1_cluster" "main" { + count = var.enable_managed_postgresql ? 
1 : 0 + parent_id = var.parent_id + name = "${var.name_prefix}-postgresql" + network_id = nebius_vpc_v1_network.main.id + + config = { + version = var.postgresql_version + public_access = var.postgresql_public_access + + template = { + disk = { + size_gibibytes = var.postgresql_disk_size_gib + type = var.postgresql_disk_type + } + resources = { + platform = var.postgresql_platform + preset = var.postgresql_preset + } + hosts = { + count = var.postgresql_host_count + } + } + } + + bootstrap = { + db_name = var.postgresql_database_name + user_name = var.postgresql_username + # NOTE: user_password moved to sensitive block (write-only, not stored in state) + } + + # Write-only field - password is NOT stored in Terraform state (more secure) + # Requires Terraform >= 1.11.0 + sensitive = { + bootstrap = { + user_password = local.postgresql_password + } + } +} + +# ----------------------------------------------------------------------------- +# Container Registry +# Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# Registry endpoint format: cr..nebius.cloud/ +# ----------------------------------------------------------------------------- +resource "nebius_registry_v1_registry" "main" { + count = var.enable_container_registry ? 1 : 0 + + parent_id = var.parent_id + name = var.container_registry_name != "" ? 
var.container_registry_name : "${var.name_prefix}-registry" +} diff --git a/applications/osmo/deploy/001-iac/modules/platform/outputs.tf b/applications/osmo/deploy/001-iac/modules/platform/outputs.tf new file mode 100755 index 000000000..0d54886bc --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/platform/outputs.tf @@ -0,0 +1,140 @@ +# ============================================================================= +# Platform Module Outputs +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Network Outputs +# ----------------------------------------------------------------------------- +output "network_id" { + description = "VPC network ID" + value = nebius_vpc_v1_network.main.id +} + +output "subnet_id" { + description = "VPC subnet ID" + value = nebius_vpc_v1_subnet.main.id +} + +# ----------------------------------------------------------------------------- +# Storage Outputs +# ----------------------------------------------------------------------------- +output "storage_bucket_name" { + description = "Storage bucket name" + value = nebius_storage_v1_bucket.main.name +} + +output "storage_endpoint" { + description = "S3-compatible storage endpoint (dynamic from region)" + value = "https://storage.${var.region}.nebius.cloud" +} + +# TOS format endpoint for OSMO workflows +# See: TODO.md Issue #9 - s3:// doesn't work, tos:// does +output "storage_tos_endpoint" { + description = "TOS-format endpoint for OSMO workflow configuration" + value = "tos://storage.${var.region}.nebius.cloud/${nebius_storage_v1_bucket.main.name}" +} + +output "storage_access_key_id" { + description = "Storage access key ID" + value = nebius_iam_v2_access_key.storage.status.aws_access_key_id + sensitive = true +} + +# Storage secret is ephemeral (not stored in state) - retrieve via CLI: +# nebius mysterybox v1 payload get-by-key \ +# --secret-id $(terraform output 
-raw storage_secret_reference_id) \ +# --key secret_access_key --format json | jq -r '.data.string_value' +output "storage_secret_access_key" { + description = "Storage secret access key - use CLI command above to retrieve (ephemeral, not in state)" + value = null # Ephemeral values cannot be output; use MysteryBox CLI + sensitive = true +} + +# MysteryBox secret reference ID (for external secret management tools) +output "storage_secret_reference_id" { + description = "MysteryBox secret reference ID for storage credentials" + value = nebius_iam_v2_access_key.storage.status.secret_reference_id +} + +# ----------------------------------------------------------------------------- +# Filestore Outputs +# ----------------------------------------------------------------------------- +output "filestore_id" { + description = "Filestore ID" + value = var.enable_filestore ? nebius_compute_v1_filesystem.shared[0].id : null +} + +output "filestore_size_bytes" { + description = "Filestore size in bytes" + value = var.enable_filestore ? nebius_compute_v1_filesystem.shared[0].size_bytes : null +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Outputs (Nebius Managed Service) +# ----------------------------------------------------------------------------- +output "enable_managed_postgresql" { + description = "Whether managed PostgreSQL is enabled" + value = var.enable_managed_postgresql +} + +output "postgresql_host" { + description = "PostgreSQL host (null if using in-cluster PostgreSQL)" + value = var.enable_managed_postgresql ? nebius_msp_postgresql_v1alpha1_cluster.main[0].status.connection_endpoints.private_read_write : null +} + +output "postgresql_port" { + description = "PostgreSQL port" + value = 5432 +} + +output "postgresql_database" { + description = "PostgreSQL database name" + value = var.enable_managed_postgresql ? 
nebius_msp_postgresql_v1alpha1_cluster.main[0].bootstrap.db_name : var.postgresql_database_name +} + +output "postgresql_username" { + description = "PostgreSQL username" + value = var.enable_managed_postgresql ? nebius_msp_postgresql_v1alpha1_cluster.main[0].bootstrap.user_name : var.postgresql_username +} + +output "postgresql_password" { + description = "PostgreSQL password (null - always use MysteryBox to retrieve)" + # Note: Password is stored in MysteryBox and cannot be output directly. + # Use the CLI to retrieve: nebius mysterybox v1 payload get-by-key --secret-id --key password + value = null + sensitive = true +} + +output "postgresql_mysterybox_secret_id" { + description = "MysteryBox secret ID for PostgreSQL password (if configured)" + value = var.postgresql_mysterybox_secret_id +} + +output "mek_mysterybox_secret_id" { + description = "MysteryBox secret ID for MEK (if configured)" + value = var.mek_mysterybox_secret_id +} + +# ----------------------------------------------------------------------------- +# Container Registry Outputs +# ----------------------------------------------------------------------------- +output "enable_container_registry" { + description = "Whether Container Registry is enabled" + value = var.enable_container_registry +} + +output "container_registry_id" { + description = "Container Registry ID" + value = var.enable_container_registry ? nebius_registry_v1_registry.main[0].id : null +} + +output "container_registry_name" { + description = "Container Registry name" + value = var.enable_container_registry ? nebius_registry_v1_registry.main[0].name : null +} + +output "container_registry_endpoint" { + description = "Container Registry endpoint for docker login/push" + value = var.enable_container_registry ? 
nebius_registry_v1_registry.main[0].status.registry_fqdn : null +} diff --git a/applications/osmo/deploy/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/001-iac/modules/platform/variables.tf new file mode 100755 index 000000000..8d7f801a3 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/platform/variables.tf @@ -0,0 +1,208 @@ +# ============================================================================= +# Platform Module Variables +# ============================================================================= + +variable "parent_id" { + description = "Nebius project ID" + type = string +} + +variable "tenant_id" { + description = "Nebius tenant ID (required for IAM group membership)" + type = string +} + +variable "region" { + description = "Nebius region" + type = string +} + +variable "name_prefix" { + description = "Prefix for resource names" + type = string +} + +# ----------------------------------------------------------------------------- +# Network Configuration +# ----------------------------------------------------------------------------- + +variable "vpc_cidr" { + description = "CIDR block for VPC subnet" + type = string + default = "10.0.0.0/16" +} + +# ----------------------------------------------------------------------------- +# Storage Configuration +# ----------------------------------------------------------------------------- + +variable "storage_bucket_name" { + description = "Name for storage bucket" + type = string +} + +# ----------------------------------------------------------------------------- +# Filestore Configuration +# ----------------------------------------------------------------------------- + +variable "enable_filestore" { + description = "Enable shared filesystem" + type = bool + default = true +} + +variable "filestore_disk_type" { + description = "Filestore disk type" + type = string + default = "NETWORK_SSD" +} + +variable "filestore_size_gib" { + description = "Filestore size in GiB" + 
type = number + default = 1024 +} + +variable "filestore_block_size_kib" { + description = "Filestore block size in KiB" + type = number + default = 4 +} + +# ----------------------------------------------------------------------------- +# PostgreSQL Configuration +# ----------------------------------------------------------------------------- + +variable "enable_managed_postgresql" { + description = "Enable Nebius Managed PostgreSQL deployment" + type = bool + default = true +} + +variable "postgresql_version" { + description = "PostgreSQL version" + type = number + default = 16 + + validation { + condition = contains([14, 15, 16], var.postgresql_version) + error_message = "PostgreSQL version must be 14, 15, or 16." + } +} + +variable "postgresql_public_access" { + description = "Enable public access to PostgreSQL (for testing only, not recommended for production)" + type = bool + default = false +} + +variable "postgresql_platform" { + description = "PostgreSQL platform (cpu-e2 for eu-north1, cpu-d3 for eu-west1)" + type = string + default = "cpu-e2" + + validation { + condition = contains(["cpu-d3", "cpu-e2"], var.postgresql_platform) + error_message = "PostgreSQL platform must be cpu-e2 (eu-north1) or cpu-d3 (eu-west1)." + } +} + +variable "postgresql_preset" { + description = "PostgreSQL resource preset (2vcpu-8gb is minimum)" + type = string + default = "2vcpu-8gb" + + validation { + condition = contains(["2vcpu-8gb", "4vcpu-16gb", "8vcpu-32gb", "16vcpu-64gb"], var.postgresql_preset) + error_message = "PostgreSQL preset must be 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, or 16vcpu-64gb." + } +} + +variable "postgresql_disk_type" { + description = "PostgreSQL disk type (network-ssd for eu-north1, nbs-csi-sc for eu-west1)" + type = string + default = "network-ssd" + + validation { + condition = contains(["nbs-csi-sc", "network-ssd"], var.postgresql_disk_type) + error_message = "PostgreSQL disk type must be network-ssd (eu-north1) or nbs-csi-sc (eu-west1)." 
+ } +} + +variable "postgresql_disk_size_gib" { + description = "PostgreSQL disk size in GiB" + type = number + default = 50 +} + +variable "postgresql_host_count" { + description = "Number of PostgreSQL hosts" + type = number + default = 1 +} + +variable "postgresql_database_name" { + description = "PostgreSQL database name" + type = string + default = "osmo" +} + +variable "postgresql_username" { + description = "PostgreSQL admin username" + type = string + default = "osmo_admin" +} + +# ----------------------------------------------------------------------------- +# MysteryBox Secret IDs (REQUIRED for Managed PostgreSQL) +# ----------------------------------------------------------------------------- +# MysteryBox secret ID is REQUIRED when using Managed PostgreSQL. +# This ensures passwords are NEVER stored in Terraform state. +# +# REQUIRED setup (before terraform apply): +# 1. cd deploy/000-prerequisites +# 2. source ./secrets-init.sh +# 3. cd ../001-iac && terraform apply +# +# The script sets TF_VAR_postgresql_mysterybox_secret_id automatically. +# If you forget, Terraform will fail with a clear error message. +# ----------------------------------------------------------------------------- + +variable "postgresql_mysterybox_secret_id" { + description = "MysteryBox secret ID for PostgreSQL password (REQUIRED when enable_managed_postgresql=true)" + type = string + default = null + + validation { + condition = var.postgresql_mysterybox_secret_id == null || can(regex("^mbsec-", var.postgresql_mysterybox_secret_id)) + error_message = "PostgreSQL MysteryBox secret ID must start with 'mbsec-'. Run: source ./secrets-init.sh" + } +} + +variable "mek_mysterybox_secret_id" { + description = "MysteryBox secret ID for MEK (Master Encryption Key)" + type = string + default = null + + validation { + condition = var.mek_mysterybox_secret_id == null || can(regex("^mbsec-", var.mek_mysterybox_secret_id)) + error_message = "MEK MysteryBox secret ID must start with 'mbsec-'. 
Run: source ./secrets-init.sh" + } +} + +# ----------------------------------------------------------------------------- +# Container Registry Configuration +# Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# ----------------------------------------------------------------------------- + +variable "enable_container_registry" { + description = "Enable Nebius Container Registry for storing container images" + type = bool + default = true +} + +variable "container_registry_name" { + description = "Custom name for container registry (defaults to -registry)" + type = string + default = "" +} diff --git a/applications/osmo/deploy/001-iac/modules/platform/versions.tf b/applications/osmo/deploy/001-iac/modules/platform/versions.tf new file mode 100755 index 000000000..d056669b0 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/platform/versions.tf @@ -0,0 +1,11 @@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + random = { + source = "hashicorp/random" + version = ">= 3.0" + } + } +} diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/main.tf b/applications/osmo/deploy/001-iac/modules/wireguard/main.tf new file mode 100755 index 000000000..5cb6027db --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/wireguard/main.tf @@ -0,0 +1,70 @@ +# ============================================================================= +# WireGuard VPN Module +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Public IP Allocation +# ----------------------------------------------------------------------------- +resource "nebius_vpc_v1_allocation" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard-ip" + + ipv4_public = { + cidr = "/32" + subnet_id = var.subnet_id + } + + lifecycle { + 
create_before_destroy = true + } +} + +# ----------------------------------------------------------------------------- +# Boot Disk +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_disk" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard-boot" + size_bytes = var.disk_size_gib * 1024 * 1024 * 1024 + block_size_bytes = 4096 + type = "NETWORK_SSD" + source_image_family = { image_family = "ubuntu22.04-driverless" } +} + +# ----------------------------------------------------------------------------- +# WireGuard Instance +# ----------------------------------------------------------------------------- +resource "nebius_compute_v1_instance" "wireguard" { + parent_id = var.parent_id + name = "${var.name_prefix}-wireguard" + + boot_disk = { + attach_mode = "READ_WRITE" + existing_disk = nebius_compute_v1_disk.wireguard + } + + network_interfaces = [ + { + name = "eth0" + subnet_id = var.subnet_id + ip_address = {} + public_ip_address = { + allocation_id = nebius_vpc_v1_allocation.wireguard.id + } + } + ] + + resources = { + platform = var.platform + preset = var.preset + } + + cloud_init_user_data = templatefile("${path.module}/templates/cloud-init.yaml", { + ssh_user_name = var.ssh_user_name + ssh_public_key = var.ssh_public_key + wg_port = var.wg_port + wg_network = var.wg_network + vpc_cidr = var.vpc_cidr + ui_port = var.ui_port + }) +} diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf b/applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf new file mode 100755 index 000000000..93a54f4f5 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf @@ -0,0 +1,28 @@ +# ============================================================================= +# WireGuard Module Outputs +# ============================================================================= + +output "public_ip" { + description = "WireGuard server public IP" + value = 
nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr +} + +output "private_ip" { + description = "WireGuard server private IP" + value = nebius_compute_v1_instance.wireguard.status.network_interfaces[0].ip_address.address +} + +output "ui_url" { + description = "WireGuard Web UI URL" + value = "http://${nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr}:${var.ui_port}" +} + +output "ssh_command" { + description = "SSH command to connect" + value = "ssh ${var.ssh_user_name}@${nebius_vpc_v1_allocation.wireguard.status.details.allocated_cidr}" +} + +output "instance_id" { + description = "WireGuard instance ID" + value = nebius_compute_v1_instance.wireguard.id +} diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml b/applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml new file mode 100755 index 000000000..a8a3dca80 --- /dev/null +++ b/applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml @@ -0,0 +1,122 @@ +#cloud-config +users: + - name: ${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} + +package_update: true +package_upgrade: true +packages: + - wireguard + - wireguard-tools + - ufw + +write_files: + - content: | + [Unit] + Description=Restart WireGuard + After=network.target + [Service] + Type=oneshot + ExecStart=/usr/bin/systemctl restart wg-quick@wg0.service + [Install] + RequiredBy=wgui.path + path: /etc/systemd/system/wgui.service + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=Watch /etc/wireguard/wg0.conf for changes + [Path] + PathModified=/etc/wireguard/wg0.conf + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui.path + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=WireGuard UI Server + After=network.target + Wants=network-online.target systemd-networkd-wait-online.service + + [Service] + 
ExecStart=/opt/wireguard-ui + Restart=on-abnormal + User=root + Group=root + WorkingDirectory=/var/lib/wireguard-ui + Environment="WGUI_PASSWORD_FILE=/var/lib/wireguard-ui/initial_password" + Environment="WGUI_LOG_LEVEL=DEBUG" + + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui_server.service + permissions: "0400" + owner: root:root + +runcmd: + # Generate WireGuard keys + - wg genkey | sudo tee /etc/wireguard/private.key + - sudo chmod go= /etc/wireguard/private.key + - sudo cat /etc/wireguard/private.key | wg pubkey | sudo tee /etc/wireguard/public.key + + # Create WireGuard configuration + - export PRIVATE_KEY=$(sudo cat /etc/wireguard/private.key) + - export INTERFACE=$(ip route list default | awk '{for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}') + - | + sudo tee /etc/wireguard/wg0.conf < +# 4. Access cluster via private endpoint +# ============================================================================= diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example new file mode 100755 index 000000000..4e5fbba56 --- /dev/null +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example @@ -0,0 +1,86 @@ +# ============================================================================= +# OSMO on Nebius - Cost-Optimized Configuration +# ============================================================================= +# This configuration minimizes costs for development and testing. 
+# Estimated cost: ~$15-25 per 6-hour session (with L40S + small CPU nodes) +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "dev" +project_name = "osmo-dev" + +# ----------------------------------------------------------------------------- +# Network Settings +# ----------------------------------------------------------------------------- +# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. +vpc_cidr = "10.0.0.0/20" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # Use latest +etcd_cluster_size = 1 # Single node for dev +enable_public_endpoint = true # Direct API access + +# ----------------------------------------------------------------------------- +# CPU Nodes (minimal for system workloads) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 2 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "4vcpu-16gb" # Smallest viable for K8s +cpu_disk_size_gib = 64 +cpu_nodes_assign_public_ip = false # Private nodes only + +# CPU preset options (cpu-d3): +# 2vcpu-8gb - ~$0.08/hr (may be too small for K8s) +# 4vcpu-16gb - ~$0.11/hr (minimum recommended) +# 8vcpu-32gb - ~$0.22/hr (comfortable for dev) +# 16vcpu-64gb - ~$0.44/hr (production) + +# 
----------------------------------------------------------------------------- +# GPU Nodes (L40S - cheapest option ~$1.55/hr vs H100 ~$4-5/hr) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-l40s-a" # L40S Intel (cheapest) +gpu_nodes_preset = "1gpu-8vcpu-32gb" # Single L40S GPU +gpu_disk_size_gib = 256 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = false # No InfiniBand for L40S +enable_gpu_taints = true +gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs + +# Other GPU options in eu-north1: +# gpu-l40s-a (L40S Intel, 48GB) - cheapest ~$1.55/hr +# gpu-l40s-d (L40S AMD, 48GB) - cheap ~$1.55/hr +# gpu-h100-sxm (H100, 80GB) - ~$4-5/hr +# gpu-h200-sxm (H200, 141GB) - most expensive + +# ----------------------------------------------------------------------------- +# Storage (minimal) +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 256 # Smaller filestore + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service - minimal) +# ----------------------------------------------------------------------------- +postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) +postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_size_gib = 20 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (disabled for dev) +# ----------------------------------------------------------------------------- +enable_wireguard = false diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.example b/applications/osmo/deploy/001-iac/terraform.tfvars.example new file mode 100755 index 000000000..bfbc36dba --- /dev/null +++ 
b/applications/osmo/deploy/001-iac/terraform.tfvars.example @@ -0,0 +1,72 @@ +# ============================================================================= +# OSMO on Nebius - Terraform Variables +# ============================================================================= +# Copy this file to terraform.tfvars and customize for your deployment. +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings +# ----------------------------------------------------------------------------- +# Get these from: source ../000-prerequisites/nebius-env-init.sh + +tenant_id = "your-tenant-id" # From NEBIUS_TENANT_ID +parent_id = "your-project-id" # From NEBIUS_PROJECT_ID + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" # or eu-west1 +environment = "dev" # dev, staging, prod +project_name = "osmo" # Used for resource naming + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # null for latest +etcd_cluster_size = 3 # 1, 3, or 5 +enable_public_endpoint = true # Set false if using WireGuard + +# ----------------------------------------------------------------------------- +# CPU Node Group +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false # Private nodes only + +# ----------------------------------------------------------------------------- +# GPU Node Group +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 
+gpu_node_groups = 1 +gpu_nodes_platform = "gpu-h100-sxm" # or gpu-h200-sxm +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false # Private nodes only +enable_gpu_cluster = true # InfiniBand +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +# Platform depends on region: cpu-d3 (eu-west1), cpu-e2 (eu-north1) +postgresql_platform = "cpu-e2" # Adjust for your region +postgresql_preset = "2vcpu-8gb" # 2/4/8/16 vcpu options available +postgresql_disk_type = "network-ssd" # network-ssd (eu-north1), nbs-csi-sc (eu-west1) +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (Optional) +# ----------------------------------------------------------------------------- +enable_wireguard = false +# wireguard_port = 51820 +# wireguard_ui_port = 5000 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example new file mode 100755 index 000000000..9bda9b0fd --- /dev/null +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example @@ -0,0 +1,84 @@ +# ============================================================================= +# OSMO on Nebius - Production Configuration +# ============================================================================= +# This configuration provides full redundancy and performance. 
+# Estimated cost: ~$1000+ per 6-hour session +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "prod" +project_name = "osmo-prod" + +# ----------------------------------------------------------------------------- +# Network Settings +# ----------------------------------------------------------------------------- +# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. +vpc_cidr = "10.0.0.0/20" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null # Use latest +etcd_cluster_size = 3 # HA etcd +enable_public_endpoint = false # Private endpoint only + +# ----------------------------------------------------------------------------- +# CPU Nodes (production grade) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_type = "NETWORK_SSD" +cpu_disk_size_gib = 256 +cpu_nodes_assign_public_ip = false + +# ----------------------------------------------------------------------------- +# GPU Nodes (full 8-GPU nodes with InfiniBand) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 2 +gpu_node_groups = 2 +gpu_nodes_platform = "gpu-h200-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_type = 
"NETWORK_SSD" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = true # InfiniBand enabled +infiniband_fabric = null # Use region default +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions + +# ----------------------------------------------------------------------------- +# Storage (production grade) +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_disk_type = "NETWORK_SSD" +filestore_size_gib = 4096 # 4 TiB +filestore_block_size_kib = 4 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service - HA) +# ----------------------------------------------------------------------------- +postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_preset = "16vcpu-64gb" # Production size +postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_size_gib = 100 +postgresql_host_count = 3 # HA with replicas + +# ----------------------------------------------------------------------------- +# WireGuard VPN (enabled for secure access) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-e2" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 64 +wireguard_port = 51820 +wireguard_ui_port = 5000 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example new file mode 100755 index 000000000..e8a98c8f1 --- /dev/null +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example @@ -0,0 +1,87 @@ +# ============================================================================= +# OSMO on Nebius - Secure Configuration with WireGuard +# ============================================================================= +# This configuration prioritizes security with private-only 
access. +# All cluster access goes through WireGuard VPN. +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (get from nebius-env-init.sh) +# ----------------------------------------------------------------------------- +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "staging" +project_name = "osmo-secure" + +# ----------------------------------------------------------------------------- +# Network Settings +# ----------------------------------------------------------------------------- +# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. +vpc_cidr = "10.0.0.0/20" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster (PRIVATE ONLY) +# ----------------------------------------------------------------------------- +k8s_version = null +etcd_cluster_size = 3 +enable_public_endpoint = false # No public K8s API + +# ----------------------------------------------------------------------------- +# CPU Nodes (NO public IPs) +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false # Private only + +# ----------------------------------------------------------------------------- +# GPU Nodes (NO public IPs) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 1 +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-h100-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_size_gib = 1023 
+gpu_nodes_assign_public_ip = false # Private only +enable_gpu_cluster = true +enable_gpu_taints = true +gpu_nodes_preemptible = false # Preemptible requires project permissions + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_preset = "2vcpu-8gb" # Minimum preset +postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (REQUIRED for this config) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-e2" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 64 +wireguard_port = 51820 +wireguard_network = "10.8.0.0/24" +wireguard_ui_port = 5000 + +# ============================================================================= +# After deployment: +# 1. Set up WireGuard client: ./000-prerequisites/wireguard-client-setup.sh +# 2. Connect to VPN +# 3. 
Access cluster via private endpoint
+# =============================================================================
diff --git a/applications/osmo/deploy/001-iac/variables.tf b/applications/osmo/deploy/001-iac/variables.tf
new file mode 100755
index 000000000..576623a3b
--- /dev/null
+++ b/applications/osmo/deploy/001-iac/variables.tf
@@ -0,0 +1,437 @@
+# =============================================================================
+# Global Configuration
+# =============================================================================
+
+variable "tenant_id" {
+  description = "Nebius tenant ID"
+  type        = string
+}
+
+variable "parent_id" {
+  description = "Nebius project ID"
+  type        = string
+}
+
+variable "region" {
+  description = "Nebius region for deployment"
+  type        = string
+  default     = "eu-north1"
+
+  validation {
+    condition     = contains(["eu-north1", "eu-west1", "eu-north2", "us-central1"], var.region)
+    error_message = "Region must be one of: eu-north1, eu-west1, eu-north2, us-central1"
+  }
+}
+
+variable "environment" {
+  description = "Environment name (dev, stg, tst, pro; long forms staging/prod also accepted)"
+  type        = string
+  default     = "dev"
+
+  validation {
+    condition     = contains(["dev", "stg", "tst", "pro", "staging", "prod"], var.environment)
+    error_message = "Environment must be one of: dev, stg, tst, pro, staging, prod"
+  }
+}
+
+variable "project_name" {
+  description = "Project name used for resource naming"
+  type        = string
+  default     = "osmo"
+}
+
+# =============================================================================
+# Network Configuration
+# =============================================================================
+
+variable "vpc_cidr" {
+  description = "CIDR block for VPC subnet (/20 recommended - /16 may exhaust pool)"
+  type        = string
+  default     = "10.0.0.0/20"
+
+  validation {
+    condition     = can(cidrhost(var.vpc_cidr, 0))
+    error_message = "VPC CIDR must be a valid CIDR block"
+  }
+}
+
+# =============================================================================
+# SSH Access
+# 
============================================================================= + +variable "ssh_user_name" { + description = "SSH username for node access" + type = string + default = "ubuntu" +} + +variable "ssh_public_key" { + description = "SSH public key for node access" + type = object({ + key = optional(string) + path = optional(string, "~/.ssh/id_rsa.pub") + }) + default = {} +} + +# ============================================================================= +# Kubernetes Cluster Configuration +# ============================================================================= + +variable "k8s_version" { + description = "Kubernetes version (null for latest)" + type = string + default = null +} + +variable "etcd_cluster_size" { + description = "Size of etcd cluster (1, 3, or 5)" + type = number + default = 3 + + validation { + condition = contains([1, 3, 5], var.etcd_cluster_size) + error_message = "etcd cluster size must be 1, 3, or 5" + } +} + +variable "enable_public_endpoint" { + description = "Enable public endpoint for Kubernetes API" + type = bool + default = true +} + +# ============================================================================= +# CPU Node Group Configuration +# ============================================================================= + +variable "cpu_nodes_count" { + description = "Number of CPU nodes" + type = number + default = 3 + + validation { + condition = var.cpu_nodes_count >= 1 && var.cpu_nodes_count <= 100 + error_message = "CPU nodes count must be between 1 and 100" + } +} + +variable "cpu_nodes_platform" { + description = "Platform for CPU nodes" + type = string + default = "cpu-d3" +} + +variable "cpu_nodes_preset" { + description = "Resource preset for CPU nodes" + type = string + default = "16vcpu-64gb" +} + +variable "cpu_disk_type" { + description = "Disk type for CPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "cpu_disk_size_gib" { + description = "Disk size in GiB for CPU nodes" + type = 
number + default = 128 +} + +variable "cpu_nodes_assign_public_ip" { + description = "Assign public IPs to CPU nodes" + type = bool + default = false # Private by default for security +} + +# ============================================================================= +# GPU Node Group Configuration +# ============================================================================= + +variable "gpu_nodes_count_per_group" { + description = "Number of GPU nodes per group" + type = number + default = 1 + + validation { + condition = var.gpu_nodes_count_per_group >= 0 && var.gpu_nodes_count_per_group <= 32 + error_message = "GPU nodes per group must be between 0 and 32" + } +} + +variable "gpu_node_groups" { + description = "Number of GPU node groups" + type = number + default = 1 +} + +variable "gpu_nodes_platform" { + description = "Platform for GPU nodes" + type = string + default = null +} + +variable "gpu_nodes_preset" { + description = "Resource preset for GPU nodes" + type = string + default = null +} + +variable "gpu_disk_type" { + description = "Disk type for GPU nodes" + type = string + default = "NETWORK_SSD" +} + +variable "gpu_disk_size_gib" { + description = "Disk size in GiB for GPU nodes" + type = number + default = 1023 +} + +variable "gpu_nodes_assign_public_ip" { + description = "Assign public IPs to GPU nodes" + type = bool + default = false +} + +variable "enable_gpu_cluster" { + description = "Enable GPU cluster with InfiniBand" + type = bool + default = true +} + +variable "infiniband_fabric" { + description = "InfiniBand fabric name (null for region default)" + type = string + default = null +} + +variable "enable_gpu_taints" { + description = "Add NoSchedule taint to GPU nodes" + type = bool + default = true +} + +variable "gpu_nodes_preemptible" { + description = "Use preemptible GPU nodes (up to 70% cost savings)" + type = bool + default = false +} + +# ============================================================================= +# Filestore 
Configuration +# ============================================================================= + +variable "enable_filestore" { + description = "Enable shared filesystem" + type = bool + default = true +} + +variable "filestore_disk_type" { + description = "Filestore disk type" + type = string + default = "NETWORK_SSD" +} + +variable "filestore_size_gib" { + description = "Filestore size in GiB" + type = number + default = 1024 +} + +variable "filestore_block_size_kib" { + description = "Filestore block size in KiB" + type = number + default = 4 +} + +# ============================================================================= +# Object Storage Configuration +# ============================================================================= + +variable "storage_bucket_name" { + description = "Name for the storage bucket (must be globally unique)" + type = string + default = "" +} + +# ============================================================================= +# PostgreSQL Configuration +# Region-specific options: +# eu-west1: platform=cpu-d3, disk=nbs-csi-sc +# eu-north1: platform=cpu-e2, disk=network-ssd +# Presets (both regions): 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, 16vcpu-64gb +# ============================================================================= + +variable "enable_managed_postgresql" { + description = "Enable Nebius Managed PostgreSQL deployment" + type = bool + default = true +} + +variable "postgresql_version" { + description = "PostgreSQL version (14, 15, or 16)" + type = number + default = 16 + + validation { + condition = contains([14, 15, 16], var.postgresql_version) + error_message = "PostgreSQL version must be 14, 15, or 16." 
+ } +} + +variable "postgresql_public_access" { + description = "Enable public access to PostgreSQL (for testing only, not recommended for production)" + type = bool + default = false +} + +variable "postgresql_platform" { + description = "PostgreSQL platform (cpu-e2 for eu-north1, cpu-d3 for eu-west1)" + type = string + default = "cpu-e2" +} + +variable "postgresql_preset" { + description = "PostgreSQL resource preset (2vcpu-8gb is minimum)" + type = string + default = "2vcpu-8gb" +} + +variable "postgresql_disk_type" { + description = "PostgreSQL disk type (network-ssd for eu-north1, nbs-csi-sc for eu-west1)" + type = string + default = "network-ssd" +} + +variable "postgresql_disk_size_gib" { + description = "PostgreSQL disk size in GiB" + type = number + default = 50 +} + +variable "postgresql_host_count" { + description = "Number of PostgreSQL hosts" + type = number + default = 1 + + validation { + condition = var.postgresql_host_count >= 1 && var.postgresql_host_count <= 3 + error_message = "PostgreSQL host count must be between 1 and 3" + } +} + +variable "postgresql_database_name" { + description = "PostgreSQL database name" + type = string + default = "osmo" +} + +variable "postgresql_username" { + description = "PostgreSQL admin username" + type = string + default = "osmo_admin" +} + +# ============================================================================= +# Container Registry Configuration +# Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry +# ============================================================================= + +variable "enable_container_registry" { + description = "Enable Nebius Container Registry for storing container images" + type = bool + default = true +} + +variable "container_registry_name" { + description = "Custom name for the container registry (defaults to --registry)" + type = string + default = "" +} + +# 
============================================================================= +# MysteryBox Secrets Configuration (REQUIRED for Managed PostgreSQL) +# ============================================================================= +# These variables MUST be set when using Managed PostgreSQL. +# Secrets are stored in MysteryBox, keeping them OUT of Terraform state. +# +# REQUIRED Setup (before terraform apply): +# 1. cd deploy/000-prerequisites +# 2. source ./secrets-init.sh +# 3. This sets TF_VAR_postgresql_mysterybox_secret_id automatically +# +# If you see validation errors, you forgot to run secrets-init.sh! +# ============================================================================= + +variable "postgresql_mysterybox_secret_id" { + description = "MysteryBox secret ID for PostgreSQL password (REQUIRED - set by secrets-init.sh)" + type = string + default = null + + validation { + condition = var.postgresql_mysterybox_secret_id == null || can(regex("^mbsec-", var.postgresql_mysterybox_secret_id)) + error_message = "PostgreSQL MysteryBox secret ID must start with 'mbsec-' (e.g., mbsec-e00xxx). Run: source ./secrets-init.sh" + } +} + +variable "mek_mysterybox_secret_id" { + description = "MysteryBox secret ID for OSMO MEK (Master Encryption Key)" + type = string + default = null + + validation { + condition = var.mek_mysterybox_secret_id == null || can(regex("^mbsec-", var.mek_mysterybox_secret_id)) + error_message = "MEK MysteryBox secret ID must start with 'mbsec-' (e.g., mbsec-e00xxx). 
Run: source ./secrets-init.sh" + } +} + +# ============================================================================= +# WireGuard VPN Configuration +# ============================================================================= + +variable "enable_wireguard" { + description = "Enable WireGuard VPN for private access" + type = bool + default = false +} + +variable "wireguard_platform" { + description = "Platform for WireGuard instance" + type = string + default = "cpu-e2" +} + +variable "wireguard_preset" { + description = "Resource preset for WireGuard instance" + type = string + default = "2vcpu-8gb" +} + +variable "wireguard_disk_size_gib" { + description = "Disk size for WireGuard instance" + type = number + default = 64 +} + +variable "wireguard_port" { + description = "WireGuard UDP port" + type = number + default = 51820 +} + +variable "wireguard_network" { + description = "WireGuard VPN network CIDR" + type = string + default = "10.8.0.0/24" +} + +variable "wireguard_ui_port" { + description = "WireGuard Web UI port" + type = number + default = 5000 +} diff --git a/applications/osmo/deploy/001-iac/versions.tf b/applications/osmo/deploy/001-iac/versions.tf new file mode 100755 index 000000000..6042f66dd --- /dev/null +++ b/applications/osmo/deploy/001-iac/versions.tf @@ -0,0 +1,25 @@ +terraform { + # Requires >= 1.10.0 for ephemeral resources (MysteryBox integration) + # Requires >= 1.11.0 for write-only sensitive fields (PostgreSQL password) + required_version = ">= 1.11.0" + + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + random = { + source = "hashicorp/random" + version = ">= 3.0" + } + units = { + source = "dstaroff/units" + version = ">= 1.1.1" + } + } +} + +provider "nebius" { + domain = "api.eu.nebius.cloud:443" +} + +provider "random" {} diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh 
#!/bin/bash
#
# Deploy GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler)
#
# Environment variables (all optional):
#   ENABLE_NETWORK_OPERATOR  "true" to also deploy the NVIDIA Network Operator
#   KAI_SCHEDULER_VERSION    KAI Scheduler chart version (default: 0.4.0)
#
# Namespaces and VALUES_DIR are provided by defaults.sh; logging helpers
# (log_info/log_success/log_warning) and check_* come from lib/common.sh.

# pipefail so a failure on the left side of the
# `kubectl create ... | kubectl apply` pipelines is not silently ignored.
set -eo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/common.sh"
source "${SCRIPT_DIR}/defaults.sh"

echo ""
echo "========================================"
echo " GPU Infrastructure Deployment"
echo "========================================"
echo ""

# Check prerequisites
check_kubectl || exit 1
check_helm || exit 1

# Add Helm repos
log_info "Adding Helm repositories..."
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
helm repo update

# -----------------------------------------------------------------------------
# Deploy GPU Operator
# -----------------------------------------------------------------------------
log_info "Deploying NVIDIA GPU Operator..."

# Idempotent namespace creation: safe to re-run against an existing namespace.
kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

helm upgrade --install gpu-operator nvidia/gpu-operator \
  --namespace "${GPU_OPERATOR_NAMESPACE}" \
  --values "${VALUES_DIR}/gpu-operator.yaml" \
  --timeout 10m

log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)"

# Brief wait for core operator pod only (not GPU node components)
sleep 10
kubectl get pods -n "${GPU_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true

# -----------------------------------------------------------------------------
# Deploy Network Operator (for InfiniBand) - OPTIONAL
# -----------------------------------------------------------------------------
if [[ "${ENABLE_NETWORK_OPERATOR:-false}" == "true" ]]; then
  log_info "Deploying NVIDIA Network Operator (InfiniBand support)..."

  kubectl create namespace "${NETWORK_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

  helm upgrade --install network-operator nvidia/network-operator \
    --namespace "${NETWORK_OPERATOR_NAMESPACE}" \
    --values "${VALUES_DIR}/network-operator.yaml" \
    --timeout 10m

  log_success "Network Operator deployed"

  # Brief wait and show status
  sleep 5
  kubectl get pods -n "${NETWORK_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true
else
  log_info "Skipping Network Operator (set ENABLE_NETWORK_OPERATOR=true to install)"
fi

# -----------------------------------------------------------------------------
# Deploy KAI Scheduler (from NVIDIA OCI registry)
# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html
# -----------------------------------------------------------------------------
log_info "Deploying KAI Scheduler..."

kubectl create namespace "${KAI_SCHEDULER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

# Install directly from OCI registry
KAI_VERSION="${KAI_SCHEDULER_VERSION:-0.4.0}"
helm upgrade --install kai-scheduler \
  oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler \
  --version "${KAI_VERSION}" \
  --namespace "${KAI_SCHEDULER_NAMESPACE}" \
  --values "${VALUES_DIR}/kai-scheduler.yaml" \
  --timeout 5m

log_success "KAI Scheduler deployed"

# Brief wait and show status
sleep 5
kubectl get pods -n "${KAI_SCHEDULER_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true

# -----------------------------------------------------------------------------
# Verify Installation
# -----------------------------------------------------------------------------
echo ""
log_info "Verifying GPU infrastructure..."

# Check GPU nodes. Strip whitespace (BSD `wc` pads its output) so the
# numeric comparison is reliable; default to 0 if kubectl itself fails.
GPU_NODES=$(kubectl get nodes -l node-type=gpu -o name 2>/dev/null | wc -l | tr -d '[:space:]') || GPU_NODES=0
if [[ "${GPU_NODES}" -gt 0 ]]; then
  log_success "Found ${GPU_NODES} GPU node(s)"
  kubectl get nodes -l node-type=gpu -o wide
else
  log_warning "No GPU nodes found yet (they may still be provisioning)"
fi

echo ""
echo "========================================"
log_success "GPU Infrastructure deployment complete!"
echo "========================================"
echo ""
echo "Next step: ./02-deploy-observability.sh"
echo ""
#!/bin/bash
#
# Deploy Observability Stack (Prometheus, Grafana, Loki)
#
# Environment variables (optional):
#   GRAFANA_ADMIN_PASSWORD  Grafana admin password; a random one is
#                           generated (and printed at the end) when unset.
#
# MONITORING_NAMESPACE and VALUES_DIR come from defaults.sh; logging and
# check_*/wait_for_pods helpers come from lib/common.sh.

# pipefail so a failure on the left side of the
# `kubectl create ... | kubectl apply` pipeline is not silently ignored.
set -eo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/lib/common.sh"
source "${SCRIPT_DIR}/defaults.sh"

echo ""
echo "========================================"
echo " Observability Stack Deployment"
echo "========================================"
echo ""

# Check prerequisites
check_kubectl || exit 1
check_helm || exit 1

# Add Helm repos
log_info "Adding Helm repositories..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
helm repo add grafana https://grafana.github.io/helm-charts --force-update
helm repo update

# Create namespace (idempotent; safe on re-runs)
kubectl create namespace "${MONITORING_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

# Generate Grafana password if not set (the :- guard keeps this safe even
# if the script is later run under `set -u`).
if [[ -z "${GRAFANA_ADMIN_PASSWORD:-}" ]]; then
  GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 16)
  log_info "Generated Grafana admin password"
fi

# -----------------------------------------------------------------------------
# Deploy Prometheus
# -----------------------------------------------------------------------------
log_info "Deploying Prometheus..."

helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
  --namespace "${MONITORING_NAMESPACE}" \
  --values "${VALUES_DIR}/prometheus.yaml" \
  --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD}" \
  --wait --timeout 10m

log_success "Prometheus stack deployed"

# -----------------------------------------------------------------------------
# Deploy Loki
# -----------------------------------------------------------------------------
log_info "Deploying Loki..."

helm upgrade --install loki grafana/loki-stack \
  --namespace "${MONITORING_NAMESPACE}" \
  --values "${VALUES_DIR}/loki.yaml" \
  --wait --timeout 10m

log_success "Loki deployed"

# -----------------------------------------------------------------------------
# Deploy Promtail
# -----------------------------------------------------------------------------
log_info "Deploying Promtail..."

helm upgrade --install promtail grafana/promtail \
  --namespace "${MONITORING_NAMESPACE}" \
  --values "${VALUES_DIR}/promtail.yaml" \
  --wait --timeout 5m

log_success "Promtail deployed"

# -----------------------------------------------------------------------------
# Configure Grafana Datasources
# -----------------------------------------------------------------------------
log_info "Configuring Grafana datasources..."

# Loki datasource is auto-configured via values

# Wait for Grafana
wait_for_pods "${MONITORING_NAMESPACE}" "app.kubernetes.io/name=grafana" 180

# -----------------------------------------------------------------------------
# Output Access Information
# -----------------------------------------------------------------------------
echo ""
echo "========================================"
log_success "Observability stack deployment complete!"
echo "========================================"
echo ""
echo "Access Grafana:"
echo "  kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-grafana 3000:80"
echo "  URL: http://localhost:3000"
echo "  Username: admin"
echo "  Password: ${GRAFANA_ADMIN_PASSWORD}"
echo ""
echo "Access Prometheus:"
echo "  kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-kube-prometheus-prometheus 9090:9090"
echo "  URL: http://localhost:9090"
echo ""
echo "Next step: ./03-deploy-osmo-control-plane.sh"
echo ""
"${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Service Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +# Deploy Keycloak in same namespace as PostgreSQL to simplify DNS resolution +KEYCLOAK_NAMESPACE="${OSMO_NAMESPACE}" +OSMO_DOMAIN="${OSMO_DOMAIN:-osmo.local}" + +# Keycloak admin password - check for existing secret first to maintain consistency +if [[ -z "${KEYCLOAK_ADMIN_PASSWORD:-}" ]]; then + # Try to get existing password from secret + EXISTING_KC_PASS=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null || true) + if [[ -n "${EXISTING_KC_PASS}" ]]; then + KEYCLOAK_ADMIN_PASSWORD="${EXISTING_KC_PASS}" + log_info "Using existing Keycloak admin password from secret" + else + KEYCLOAK_ADMIN_PASSWORD="$(openssl rand -base64 12)" + log_info "Generated new Keycloak admin password" + fi +fi + +# ----------------------------------------------------------------------------- +# Get Database Configuration from Terraform (Nebius Managed PostgreSQL) +# ----------------------------------------------------------------------------- +log_info "Using Nebius Managed PostgreSQL..." + log_info "Retrieving database configuration..." 
+ + # Get connection details from Terraform outputs + POSTGRES_HOST=$(get_tf_output "postgresql.host" "../001-iac" || echo "") + POSTGRES_PORT=$(get_tf_output "postgresql.port" "../001-iac" || echo "5432") + POSTGRES_DB=$(get_tf_output "postgresql.database" "../001-iac" || echo "osmo") + POSTGRES_USER=$(get_tf_output "postgresql.username" "../001-iac" || echo "osmo_admin") + + # Get password - try MysteryBox first, then Terraform output, then env vars + # MysteryBox secret ID is set by secrets-init.sh as TF_VAR_postgresql_mysterybox_secret_id + POSTGRES_SECRET_ID="${TF_VAR_postgresql_mysterybox_secret_id:-${OSMO_POSTGRESQL_SECRET_ID:-}}" + + if [[ -n "$POSTGRES_SECRET_ID" ]]; then + log_info "Reading PostgreSQL password from MysteryBox (secret: $POSTGRES_SECRET_ID)..." + POSTGRES_PASSWORD=$(get_mysterybox_secret "$POSTGRES_SECRET_ID" "password" || echo "") + if [[ -n "$POSTGRES_PASSWORD" ]]; then + log_success "PostgreSQL password retrieved from MysteryBox" + else + log_warning "Failed to read password from MysteryBox" + fi + fi + + # Fall back to Terraform output (only works if not using MysteryBox) + if [[ -z "$POSTGRES_PASSWORD" ]]; then + POSTGRES_PASSWORD=$(get_tf_output "postgresql_password" "../001-iac" || echo "") + fi + + # Fall back to environment variables or prompt + if [[ -z "$POSTGRES_HOST" || -z "$POSTGRES_PASSWORD" ]]; then + log_warning "Could not retrieve PostgreSQL configuration automatically" + log_info "Checking environment variables..." 
+ + POSTGRES_HOST=${POSTGRES_HOST:-${OSMO_POSTGRES_HOST:-""}} + POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-${OSMO_POSTGRES_PASSWORD:-""}} + + if [[ -z "$POSTGRES_HOST" ]]; then + read -p "PostgreSQL Host: " POSTGRES_HOST + fi + if [[ -z "$POSTGRES_PASSWORD" ]]; then + read -s -p "PostgreSQL Password: " POSTGRES_PASSWORD + echo "" + fi + fi + +log_success "Database: ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" + +# ----------------------------------------------------------------------------- +# Get Storage Configuration +# ----------------------------------------------------------------------------- +log_info "Retrieving storage configuration..." + +S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" || echo "") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "https://storage.eu-north1.nebius.cloud") +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" || echo "") + +# Secret access key is stored in MysteryBox (ephemeral, not in Terraform state) +S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" || echo "") +S3_SECRET_KEY="" + +if [[ -n "$S3_SECRET_REF_ID" ]]; then + log_info "Retrieving storage secret from MysteryBox..." + # IAM access key secrets are stored with key "secret" in MysteryBox + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" || echo "") + if [[ -n "$S3_SECRET_KEY" ]]; then + log_success "Storage secret retrieved from MysteryBox" + else + log_warning "Could not retrieve storage secret from MysteryBox" + fi +fi + +if [[ -n "$S3_BUCKET" ]]; then + log_success "Storage: ${S3_BUCKET} @ ${S3_ENDPOINT}" +fi + +# ----------------------------------------------------------------------------- +# Add Helm Repositories +# ----------------------------------------------------------------------------- +log_info "Adding Helm repositories..." 
+helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update +helm repo add bitnami https://charts.bitnami.com/bitnami --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Step 1: Create Namespaces +# ----------------------------------------------------------------------------- +log_info "Creating namespace..." +kubectl create namespace "${OSMO_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - +# Note: Keycloak is deployed in the same namespace as OSMO (no separate namespace needed) + +# ----------------------------------------------------------------------------- +# Step 2: Configure PostgreSQL - Verify Connection and Create Databases +# ----------------------------------------------------------------------------- +log_info "Verifying PostgreSQL connection..." + + # Delete any existing test/init pods + kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + + # Create a temporary secret with DB credentials + # NOTE: PGDATABASE must be the bootstrap database ('osmo') for Nebius MSP PostgreSQL + kubectl create secret generic osmo-db-init-creds \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=PGPASSWORD="${POSTGRES_PASSWORD}" \ + --from-literal=PGHOST="${POSTGRES_HOST}" \ + --from-literal=PGPORT="${POSTGRES_PORT}" \ + --from-literal=PGUSER="${POSTGRES_USER}" \ + --from-literal=PGDATABASE="${POSTGRES_DB}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # ----------------------------------------------------------------------------- + # Connection Test - Verify credentials before proceeding + # ----------------------------------------------------------------------------- + log_info "Testing PostgreSQL connection (this may take a moment)..." 
+ + kubectl run osmo-db-test \ + --namespace "${OSMO_NAMESPACE}" \ + --image=postgres:16-alpine \ + --restart=Never \ + --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ + --env="PGHOST=${POSTGRES_HOST}" \ + --env="PGPORT=${POSTGRES_PORT}" \ + --env="PGUSER=${POSTGRES_USER}" \ + --env="PGDATABASE=${POSTGRES_DB}" \ + --command -- sh -c 'psql -c "SELECT 1" >/dev/null 2>&1 && echo "CONNECTION_OK" || echo "CONNECTION_FAILED"' \ + >/dev/null 2>&1 + + # Wait for test pod to complete + test_elapsed=0 + test_status="" + while [[ $test_elapsed -lt 60 ]]; do + test_status=$(kubectl get pod osmo-db-test -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + if [[ "$test_status" == "Succeeded" || "$test_status" == "Failed" ]]; then + break + fi + sleep 2 + ((test_elapsed += 2)) + done + + # Check test result + test_result=$(kubectl logs osmo-db-test -n "${OSMO_NAMESPACE}" 2>/dev/null || echo "") + kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found >/dev/null 2>&1 + + if [[ "$test_result" != *"CONNECTION_OK"* ]]; then + log_error "PostgreSQL connection test failed!" + echo "" + echo "Connection details:" + echo " Host: ${POSTGRES_HOST}" + echo " Port: ${POSTGRES_PORT}" + echo " Database: ${POSTGRES_DB}" + echo " User: ${POSTGRES_USER}" + echo " Password: (from MysteryBox secret ${TF_VAR_postgresql_mysterybox_secret_id:-'not set'})" + echo "" + echo "Possible causes:" + echo " 1. Password mismatch - MysteryBox password doesn't match PostgreSQL" + echo " Fix: Update MysteryBox or recreate PostgreSQL cluster" + echo " 2. Network issue - Cluster cannot reach PostgreSQL" + echo " 3. 
PostgreSQL not ready - Wait and retry" + echo "" + echo "To debug manually:" + echo " kubectl run psql-debug --rm -it --image=postgres:16-alpine -n osmo -- sh" + echo " PGPASSWORD='' psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB}" + exit 1 + fi + + log_success "PostgreSQL connection verified" + + # ----------------------------------------------------------------------------- + # Database Creation + # ----------------------------------------------------------------------------- + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + log_info "Creating OSMO and Keycloak databases (if not exist)..." + else + log_info "Verifying OSMO database..." + fi + + # NOTE: Nebius MSP PostgreSQL creates the bootstrap database ('osmo') automatically. + # The bootstrap user can only connect to this database, not 'postgres'. + # We connect to 'osmo' and create additional databases from there. + # Pass DEPLOY_KEYCLOAK to the init pod + kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null 2>&1; then + echo "ERROR: Cannot connect to PostgreSQL" + echo "Debug: PGHOST=\$PGHOST, PGPORT=\$PGPORT, PGUSER=\$PGUSER, PGDATABASE=\${PGDATABASE:-osmo}" + # Try with verbose error + psql -d "\${PGDATABASE:-osmo}" -c "SELECT 1" 2>&1 || true + exit 1 + fi + echo "Connection successful to database '\${PGDATABASE:-osmo}'" + + # The 'osmo' database already exists (created by Nebius bootstrap) + echo "Database 'osmo' exists (created by Nebius MSP bootstrap)" + + # Create keycloak database only if Keycloak deployment is enabled + DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" + if [ "\$DEPLOY_KEYCLOAK" = "true" ]; then + # Note: This requires the user to have CREATEDB privilege + if psql -d "\${PGDATABASE:-osmo}" -tAc "SELECT 1 FROM pg_database WHERE datname='keycloak'" | grep -q 1; then + echo "Database 'keycloak' already exists" + else + echo "Creating database 'keycloak'..." 
+ psql -d "\${PGDATABASE:-osmo}" -c "CREATE DATABASE keycloak;" || { + echo "WARNING: Could not create 'keycloak' database." + echo "The bootstrap user may not have CREATEDB privilege." + echo "Keycloak will use a schema in the 'osmo' database instead." + } + fi + fi + + # Verify databases exist + echo "" + echo "Verifying databases..." + psql -d "\${PGDATABASE:-osmo}" -c "\l" | grep -E "osmo" || true + + echo "" + echo "SUCCESS: Database initialization complete" + restartPolicy: Never +EOF + + # Wait for pod to complete (init pods may finish before Ready condition is detected) + log_info "Running database initialization..." + + # Poll for completion - init pods go directly to Completed/Succeeded very quickly + max_wait=120 + elapsed=0 + status="" + + while [[ $elapsed -lt $max_wait ]]; do + status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + + if [[ "$status" == "Succeeded" ]]; then + break + elif [[ "$status" == "Failed" ]]; then + log_error "Database initialization failed. Checking logs..." + kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + exit 1 + fi + + sleep 2 + ((elapsed += 2)) + done + + if [[ "$status" != "Succeeded" ]]; then + log_error "Database initialization timed out (status: $status). Checking logs..." 
+ kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" 2>/dev/null || true + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + exit 1 + fi + + # Show logs for verification + log_info "Database initialization output:" + kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" + + # Cleanup + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + +log_success "Databases verified and ready" + +# ----------------------------------------------------------------------------- +# Step 3: Create Secrets +# ----------------------------------------------------------------------------- +log_info "Creating secrets..." + +# Database secret for Keycloak (only if Keycloak is being deployed) +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + kubectl create secret generic keycloak-db-secret \ + --namespace "${KEYCLOAK_NAMESPACE}" \ + --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - +fi + +# Create the postgres-secret that OSMO chart expects +# The chart looks for passwordSecretName: postgres-secret, passwordSecretKey: password +kubectl create secret generic postgres-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + +# OIDC secrets (only needed if Keycloak is deployed) +# These are placeholder values that get overwritten with real Keycloak client secrets +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + HMAC_SECRET=$(openssl rand -base64 32) + CLIENT_SECRET=$(openssl rand -base64 32) + kubectl create secret generic oidc-secrets \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=client_secret="${CLIENT_SECRET}" \ + --from-literal=hmac_secret="${HMAC_SECRET}" \ + --dry-run=client -o yaml | kubectl apply -f - +fi + +# Storage secret (if available) +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + kubectl create secret generic osmo-storage \ + --namespace "${OSMO_NAMESPACE}" \ 
+ --from-literal=access-key-id="${S3_ACCESS_KEY}" \ + --from-literal=secret-access-key="${S3_SECRET_KEY}" \ + --dry-run=client -o yaml | kubectl apply -f - +fi + +# MEK (Master Encryption Key) Configuration +# OSMO expects MEK in JWK (JSON Web Key) format, base64-encoded +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html +MEK_ID="${MEK_ID:-key1}" +log_info "Configuring MEK (Master Encryption Key)..." + +# Try to read MEK from MysteryBox first (set by secrets-init.sh) +# MysteryBox secret ID is set as TF_VAR_mek_mysterybox_secret_id +MEK_SECRET_ID="${TF_VAR_mek_mysterybox_secret_id:-${OSMO_MEK_SECRET_ID:-}}" +MEK_DATA="" + +if [[ -n "$MEK_SECRET_ID" ]]; then + log_info "Reading MEK from MysteryBox (secret: $MEK_SECRET_ID)..." + MEK_DATA=$(get_mysterybox_secret "$MEK_SECRET_ID" "mek" || echo "") + if [[ -n "$MEK_DATA" ]]; then + log_success "MEK retrieved from MysteryBox" + # MEK from secrets-init.sh is in format: {"currentMek":"key1","meks":{"key1":""}} + # Extract the key ID and encoded value + MEK_ID=$(echo "$MEK_DATA" | jq -r '.currentMek // "key1"' 2>/dev/null || echo "key1") + MEK_ENCODED=$(echo "$MEK_DATA" | jq -r ".meks.${MEK_ID} // empty" 2>/dev/null || echo "") + + if [[ -z "$MEK_ENCODED" ]]; then + log_warning "Could not parse MEK from MysteryBox, will generate new one" + MEK_DATA="" + fi + else + log_warning "Failed to read MEK from MysteryBox" + fi +fi + +# Generate new MEK if not retrieved from MysteryBox +if [[ -z "$MEK_DATA" || -z "$MEK_ENCODED" ]]; then + log_info "Generating new MEK in JWK format..." 
+ MEK_KEY_RAW="$(openssl rand -base64 32 | tr -d '\n')" + MEK_JWK="{\"k\":\"${MEK_KEY_RAW}\",\"kid\":\"${MEK_ID}\",\"kty\":\"oct\"}" + MEK_ENCODED="$(echo -n "$MEK_JWK" | base64 | tr -d '\n')" + log_success "New MEK generated" +fi + +# Create MEK ConfigMap (OSMO expects ConfigMap, not Secret) +kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null; then + log_info "Redis already deployed" +else + helm upgrade --install redis bitnami/redis \ + --namespace "${OSMO_NAMESPACE}" \ + --set architecture=standalone \ + --set auth.enabled=false \ + --set master.persistence.size=1Gi \ + --set master.resources.requests.cpu=100m \ + --set master.resources.requests.memory=128Mi \ + --wait --timeout 5m + + log_success "Redis deployed" +fi + +REDIS_HOST="redis-master.${OSMO_NAMESPACE}.svc.cluster.local" + +# ----------------------------------------------------------------------------- +# Step 4: Deploy Keycloak (Enable with DEPLOY_KEYCLOAK=true) +# ----------------------------------------------------------------------------- +# Keycloak provides authentication for OSMO +# Required for: osmo login, osmo token, backend operator +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak + +# Keycloak service URL (same namespace as OSMO) +KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" +KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" +AUTH_DOMAIN="auth-${OSMO_DOMAIN}" + +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + log_info "Deploying Keycloak for OSMO authentication..." 
+ log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + + # ------------------------------------------------------------------------- + # Step 1: Create Keycloak database in PostgreSQL + # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-1-configure-postgresql + # ------------------------------------------------------------------------- + log_info "Creating Keycloak database in PostgreSQL..." + + # Delete old pod if exists + kubectl delete pod osmo-db-ops -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + + # Use the managed PostgreSQL credentials (bootstrap user has CREATEDB privilege) + cat > /tmp/keycloak-db-init.yaml </dev/null || true + sleep 5 + kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/osmo-db-ops -n "${OSMO_NAMESPACE}" --timeout=60s || { + log_warning "Database creation pod status:" + kubectl logs -n "${OSMO_NAMESPACE}" osmo-db-ops || true + } + kubectl delete pod osmo-db-ops -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + rm -f /tmp/keycloak-db-init.yaml + log_success "Keycloak database ready" + + # ------------------------------------------------------------------------- + # Step 2: Create secrets for Keycloak + # ------------------------------------------------------------------------- + log_info "Creating Keycloak secrets..." 
+ + # Save admin password to secret for future re-runs + kubectl create secret generic keycloak-admin-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=password="${KEYCLOAK_ADMIN_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create keycloak-db-secret for external database (per OSMO docs) + # Uses the managed PostgreSQL credentials + kubectl create secret generic keycloak-db-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "Keycloak secrets created" + + # ------------------------------------------------------------------------- + # Step 3: Install Keycloak using Bitnami Helm chart + # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#install-keycloak-using-bitnami-helm-chart + # ------------------------------------------------------------------------- + log_info "Installing Keycloak using Bitnami Helm chart..." + + # Add Bitnami repo + helm repo add bitnami https://charts.bitnami.com/bitnami --force-update 2>/dev/null || true + helm repo update bitnami + + # Create keycloak-values.yaml per OSMO documentation + cat > /tmp/keycloak-values.yaml </dev/null | grep -q keycloak; then + break + fi + echo " Waiting for Keycloak pod to be created... ($i/30)" + sleep 5 + done + + # Now wait for it to be ready + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ + -n "${OSMO_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak pod not ready yet, checking logs..." + kubectl logs -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --tail=30 || true + } + + # Additional wait for Keycloak to fully initialize + log_info "Waiting for Keycloak to fully initialize..." 
+ sleep 30 + + # Configure Keycloak realm and clients for OSMO + # Per documentation: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#post-installation-keycloak-configuration + log_info "Configuring Keycloak realm and clients for OSMO..." + + # Generate client secret + OIDC_CLIENT_SECRET=$(openssl rand -hex 16) + + # Create a job to configure Keycloak + cat > /tmp/keycloak-config-job.yaml < /dev/null 2>&1; then + echo "Keycloak is ready" + break + fi + echo "Attempt \$i: Keycloak not ready yet..." + sleep 15 + done + + echo "Getting admin token..." + for i in 1 2 3 4 5; do + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + if [ -n "\$TOKEN" ]; then break; fi + echo "Retry \$i: waiting for token..." + sleep 10 + done + + if [ -z "\$TOKEN" ]; then + echo "Failed to get admin token" + exit 1 + fi + echo "Got admin token" + + # Create osmo realm (per documentation) + echo "Creating osmo realm..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"realm":"osmo","enabled":true,"registrationAllowed":false}' || echo "Realm may already exist" + + # Create osmo-device client (for CLI device code flow) + # Per documentation: public client with device authorization grant + echo "Creating osmo-device client..." 
+ curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/clients" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "osmo-device", + "name": "OSMO Device Client", + "enabled": true, + "publicClient": true, + "directAccessGrantsEnabled": true, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "protocol": "openid-connect", + "attributes": { + "oauth2.device.authorization.grant.enabled": "true" + } + }' || echo "Client may already exist" + + # Create osmo-browser-flow client (for web UI) + # Per documentation: confidential client with standard flow + echo "Creating osmo-browser-flow client..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/clients" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "osmo-browser-flow", + "name": "OSMO Browser Flow Client", + "enabled": true, + "publicClient": false, + "secret": "${OIDC_CLIENT_SECRET}", + "directAccessGrantsEnabled": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "serviceAccountsEnabled": true, + "protocol": "openid-connect", + "redirectUris": ["*"], + "webOrigins": ["*"] + }' || echo "Client may already exist" + + # Create a test user (per documentation) + echo "Creating osmo-admin user..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/users" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "username": "osmo-admin", + "enabled": true, + "emailVerified": true, + "firstName": "OSMO", + "lastName": "Admin", + "email": "osmo-admin@example.com", + "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] + }' || echo "User may already exist" + + echo "" + echo "=========================================" + echo "Keycloak OSMO configuration complete!" 
+ echo "=========================================" + echo "Realm: osmo" + echo "Clients: osmo-device, osmo-browser-flow" + echo "Test user: osmo-admin / osmo-admin" + echo "" +EOF + + # Delete any previous config job + kubectl delete job keycloak-osmo-setup -n "${KEYCLOAK_NAMESPACE}" --ignore-not-found 2>/dev/null || true + + kubectl apply -f /tmp/keycloak-config-job.yaml + + log_info "Waiting for Keycloak configuration job..." + kubectl wait --for=condition=complete job/keycloak-osmo-setup \ + -n "${KEYCLOAK_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak configuration may have failed, check logs:" + kubectl logs -n "${KEYCLOAK_NAMESPACE}" -l job-name=keycloak-osmo-setup --tail=50 || true + } + + # Store the client secret for OIDC (used by Envoy sidecar) + kubectl create secret generic oidc-secrets \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=client_secret="${OIDC_CLIENT_SECRET}" \ + --from-literal=hmac_secret="$(openssl rand -base64 32)" \ + --dry-run=client -o yaml | kubectl apply -f - + + rm -f /tmp/keycloak-values.yaml /tmp/keycloak-config-job.yaml + + log_success "Keycloak deployed and configured" + echo "" + echo "Keycloak Access:" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints (in-cluster):" + echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" + echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" + echo "" + + # Keycloak is deployed but we disable OSMO's internal auth + # because OSMO's JWT validation expects its own keys, not Keycloak's + # Users can still get tokens from Keycloak for future use + AUTH_ENABLED="false" + log_info "Note: OSMO internal auth disabled (use Keycloak tokens with API directly)" +else + log_info "Skipping Keycloak (set DEPLOY_KEYCLOAK=true to enable)" + log_warning "Without 
Keycloak, 'osmo login' and token creation will not work" + log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + AUTH_ENABLED="false" +fi + +# ----------------------------------------------------------------------------- +# Step 5: Create OSMO Values File +# ----------------------------------------------------------------------------- +log_info "Creating OSMO values file..." + +# Create the values file with proper extraEnv and extraVolumes for each service +# This configures PostgreSQL password via env var and MEK via volume mount +cat > /tmp/osmo_values.yaml </dev/null || true + +log_success "OSMO Service Helm deployment complete" + +# ----------------------------------------------------------------------------- +# Step 7: Deploy Router +# ----------------------------------------------------------------------------- +log_info "Deploying OSMO Router..." + +# Router requires configFile.enabled=true to mount the mek-config ConfigMap +# It also needs db-secret (not postgres-secret) for the password +kubectl create secret generic db-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=db-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + +helm upgrade --install osmo-router osmo/router \ + --namespace "${OSMO_NAMESPACE}" \ + --set service.type=ClusterIP \ + --set global.domain=osmo.local \ + --set services.configFile.enabled=true \ + --set services.postgres.serviceName="${POSTGRES_HOST}" \ + --set services.postgres.port=${POSTGRES_PORT} \ + --set services.postgres.db=osmo \ + --set services.postgres.user="${POSTGRES_USER}" \ + --set services.service.ingress.enabled=false \ + --set services.service.scaling.minReplicas=1 \ + --set services.service.scaling.maxReplicas=1 \ + --set sidecars.envoy.enabled=false \ + --set sidecars.logAgent.enabled=false \ + --wait --timeout 5m || log_warning "Router deployment had issues" + +log_success "OSMO Router 
deployed" + +# Delete router ingress +kubectl delete ingress -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/instance=osmo-router --ignore-not-found 2>/dev/null || true + +# ----------------------------------------------------------------------------- +# Step 8: Deploy Web UI (Optional) +# ----------------------------------------------------------------------------- +if [[ "${DEPLOY_UI:-true}" == "true" ]]; then + log_info "Deploying OSMO Web UI..." + + helm upgrade --install osmo-ui osmo/web-ui \ + --namespace "${OSMO_NAMESPACE}" \ + --set service.type=ClusterIP \ + --set global.domain=osmo.local \ + --set services.ui.ingress.enabled=false \ + --set services.ui.replicas=1 \ + --set services.ui.apiHostname="osmo-service.${OSMO_NAMESPACE}.svc.cluster.local:80" \ + --set sidecars.envoy.enabled=false \ + --set sidecars.logAgent.enabled=false \ + --wait --timeout 5m || log_warning "UI deployment had issues" + + log_success "OSMO Web UI deployed" + + # Delete UI ingress + kubectl delete ingress -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/instance=osmo-ui --ignore-not-found 2>/dev/null || true +fi + +# Cleanup all remaining ingress resources (final sweep) +log_info "Final cleanup of any remaining Ingress resources..." +kubectl delete ingress --all -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + +# Cleanup temp files +rm -f /tmp/osmo_values.yaml + +# ----------------------------------------------------------------------------- +# Step 9: Patch Deployments to Add vault-secrets Volume +# ----------------------------------------------------------------------------- +# NOTE: The Helm chart's extraVolumes/extraVolumeMounts values don't work reliably. +# We must patch the deployments after Helm creates them to add the vault-secrets volume. +# This is a known workaround - the env vars work via extraEnv, but volumes don't. + +log_info "Patching OSMO deployments to add vault-secrets volume mount..." 
+ +# Create the JSON patch file +cat > /tmp/vault-patch.json << 'PATCH_EOF' +[ + {"op": "add", "path": "/spec/template/spec/volumes/-", "value": {"name": "vault-secrets", "secret": {"secretName": "vault-secrets"}}}, + {"op": "add", "path": "/spec/template/spec/containers/0/volumeMounts/-", "value": {"name": "vault-secrets", "mountPath": "/home/osmo/vault-agent/secrets", "readOnly": true}} +] +PATCH_EOF + +# All OSMO deployments that need the vault-secrets volume for MEK +OSMO_DEPLOYMENTS="osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router" + +for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + # Check if vault-secrets volume already exists + EXISTING_VOL=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || true) + + if [[ -z "$EXISTING_VOL" ]]; then + log_info " Patching $deploy to add vault-secrets volume..." + if kubectl patch deployment/$deploy -n "${OSMO_NAMESPACE}" --type=json --patch-file=/tmp/vault-patch.json; then + log_success " $deploy patched successfully" + else + log_warning " Failed to patch $deploy" + fi + else + log_info " $deploy already has vault-secrets volume, skipping" + fi + else + log_info " $deploy not found, skipping" + fi +done + +# Cleanup patch file +rm -f /tmp/vault-patch.json + +# Wait for rollouts to complete +log_info "Waiting for deployments to roll out with new configuration..." 
+for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + kubectl rollout status deployment/$deploy -n "${OSMO_NAMESPACE}" --timeout=180s || \ + log_warning " Timeout waiting for $deploy rollout" + fi +done + +log_success "All OSMO deployments patched with vault-secrets volume" + +# ----------------------------------------------------------------------------- +# Step 10: Patch Services for Direct Access (without Envoy) +# ----------------------------------------------------------------------------- +# Since Envoy sidecar is disabled, services need to target port 8000 directly +# instead of the 'envoy-http' named port which doesn't exist. +# This is done automatically by the helm chart when sidecars.envoy.enabled=false + +log_info "Verifying service ports (Envoy disabled)..." + +OSMO_SERVICES="osmo-service osmo-router osmo-logger osmo-agent" + +for svc in $OSMO_SERVICES; do + if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then + CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") + + if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then + log_info " Patching $svc: targetPort envoy-http -> 8000" + kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ + -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ + log_warning " Failed to patch $svc" + else + log_info " $svc: targetPort = $CURRENT_TARGET (OK)" + fi + fi +done + +log_success "Service ports verified" + +# ----------------------------------------------------------------------------- +# Step 11: Deploy NGINX Proxy +# ----------------------------------------------------------------------------- +# The nginx proxy routes traffic to osmo-service, osmo-logger, and osmo-agent +# Required for osmo-ctrl sidecar to communicate with the OSMO service +log_info "Deploying OSMO proxy (nginx)..." 
+ +if [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then + kubectl apply -f "${SCRIPT_DIR}/nginx-proxy.yaml" + kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ + log_warning "Timeout waiting for osmo-proxy rollout" + log_success "OSMO proxy deployed" +else + log_warning "nginx-proxy.yaml not found - skipping proxy deployment" + log_warning "Workflows may fail without the proxy. Create nginx-proxy.yaml and apply manually." +fi + +# ----------------------------------------------------------------------------- +# Step 12: Verify Deployment +# ----------------------------------------------------------------------------- +echo "" +log_info "Verifying deployment configuration..." + +# Verify vault-secrets volumes are mounted +echo "" +echo "Volume configuration verification:" +for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + VOL_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || echo "") + ENV_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.containers[0].env[*].name}' 2>/dev/null | grep -w "OSMO_POSTGRES_PASSWORD" || echo "") + + VOL_STATUS="✗" + ENV_STATUS="✗" + [[ -n "$VOL_CHECK" ]] && VOL_STATUS="✓" + [[ -n "$ENV_CHECK" ]] && ENV_STATUS="✓" + + echo " $deploy: vault-secrets=$VOL_STATUS, postgres_env=$ENV_STATUS" + fi +done + +echo "" +echo "Pods:" +kubectl get pods -n "${OSMO_NAMESPACE}" + +echo "" +echo "Services:" +kubectl get svc -n "${OSMO_NAMESPACE}" + +# Get service URL +OSMO_SVC=$(kubectl get svc -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=service -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "osmo-service") +OSMO_PORT=$(kubectl get svc "${OSMO_SVC}" -n "${OSMO_NAMESPACE}" -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "80") + +echo "" +echo 
"========================================" +log_success "OSMO Service deployment complete!" +echo "========================================" +echo "" +echo "OSMO Service Access:" +echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/${OSMO_SVC} 8080:${OSMO_PORT}" +echo " URL: http://localhost:8080" +echo "" +echo "" +echo "NOTE: OSMO API authentication is DISABLED for testing." +echo " The API is accessible without tokens." +echo "" +echo "Test the API:" +echo " curl http://localhost:8080/api/version" +echo " curl http://localhost:8080/api/workflow" +echo "" +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + echo "Keycloak Access (for future use):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" +fi +echo "Next step - Deploy Backend Operator:" +echo " ./04-deploy-osmo-backend.sh" +echo "" +echo "In-cluster URL (for pods): http://${OSMO_SVC}.${OSMO_NAMESPACE}.svc.cluster.local:${OSMO_PORT}" +echo "" diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh new file mode 100755 index 000000000..8a57177bf --- /dev/null +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh @@ -0,0 +1,289 @@ +#!/bin/bash +# +# Deploy OSMO Backend Operator +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Backend Operator Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# 
----------------------------------------------------------------------------- +OSMO_OPERATOR_NAMESPACE="osmo-operator" +OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" +OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}" +OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" + +# Check for OSMO Service URL (in-cluster URL for the backend operator pods) +# IMPORTANT: Backend operators connect via WebSocket to osmo-agent, NOT osmo-service! +# The osmo-service handles REST API, osmo-agent handles WebSocket connections for backends +if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then + log_info "Auto-detecting in-cluster OSMO Agent URL..." + + # Backend operators MUST connect to osmo-agent for WebSocket connections + # The osmo-service WebSocket routes only exist in dev mode + OSMO_AGENT=$(kubectl get svc -n osmo osmo-agent -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") + + if [[ -n "$OSMO_AGENT" ]]; then + OSMO_SERVICE_URL="http://osmo-agent.osmo.svc.cluster.local:80" + log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" + else + # Fallback: try to detect from any osmo-agent service + OSMO_AGENT=$(kubectl get svc -n osmo -l app.kubernetes.io/name=agent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "$OSMO_AGENT" ]]; then + OSMO_SERVICE_URL="http://${OSMO_AGENT}.osmo.svc.cluster.local:80" + log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" + else + echo "" + log_error "Could not detect OSMO Agent service. 
Deploy OSMO first: ./03-deploy-osmo-control-plane.sh" + log_error "Note: Backend operators require osmo-agent service for WebSocket connections" + exit 1 + fi + fi +fi + +# Check for OSMO Service Token +if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then + # First, ensure namespace exists so we can check for existing secret + kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true + + # Check if token secret already exists in cluster + EXISTING_TOKEN=$(kubectl get secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" -o jsonpath='{.data.token}' 2>/dev/null | base64 -d || echo "") + + if [[ -n "$EXISTING_TOKEN" ]]; then + log_info "Using existing token from secret osmo-operator-token" + OSMO_SERVICE_TOKEN="$EXISTING_TOKEN" + elif command -v osmo &>/dev/null; then + # Check if osmo CLI is already logged in (don't try to login with in-cluster URL) + log_info "Checking if OSMO CLI is already logged in..." + + # Try to generate token - this only works if CLI is already logged in + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + TOKEN_JSON=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token" \ + --service --roles osmo-backend -t json 2>/dev/null || echo "") + + if [[ -n "$TOKEN_JSON" ]]; then + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_JSON" | jq -r '.token // empty' 2>/dev/null || echo "") + fi + + if [[ -n "$OSMO_SERVICE_TOKEN" ]]; then + log_success "Service token generated: $TOKEN_NAME (expires: $EXPIRY_DATE)" + fi + fi + + # If still no token, automatically create one using port-forward + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_info "No token found - automatically creating service token..." + + # Check if osmo CLI is available + if ! command -v osmo &>/dev/null; then + log_error "osmo CLI not found. Please install it first." 
+ exit 1 + fi + + # Start port-forward in background + log_info "Starting port-forward to OSMO service..." + kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & + PORT_FORWARD_PID=$! + + # Cleanup function to kill port-forward on exit + cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi + } + trap cleanup_port_forward EXIT + + # Wait for port-forward to be ready + log_info "Waiting for port-forward to be ready..." + max_wait=30 + elapsed=0 + while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi + done + log_success "Port-forward ready" + + # Login with dev method (since auth is disabled) + log_info "Logging in to OSMO (dev method)..." + if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + exit 1 + fi + log_success "Logged in successfully" + + # Create service token + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." 
+ TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token (auto-generated)" \ + --service --roles osmo-backend 2>&1) + + # Extract token from output (format: "Access token: ") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | grep -oP 'Access token: \K.*' || echo "") + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_error "Failed to create service token" + echo "Output: $TOKEN_OUTPUT" + exit 1 + fi + + log_success "Service token created successfully" + + # Stop port-forward (we're done with it) + cleanup_port_forward + trap - EXIT + fi +fi + +# ----------------------------------------------------------------------------- +# Add OSMO Helm Repository +# ----------------------------------------------------------------------------- +log_info "Adding OSMO Helm repository..." +helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create Namespaces +# ----------------------------------------------------------------------------- +log_info "Creating namespaces..." +kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - +kubectl create namespace "${OSMO_WORKFLOWS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Secrets +# ----------------------------------------------------------------------------- +log_info "Creating operator token secret..." 
+kubectl create secret generic osmo-operator-token \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --from-literal=token="${OSMO_SERVICE_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Values File +# ----------------------------------------------------------------------------- +log_info "Creating Helm values file..." + +# Note: services.backendListener/Worker are at root level, not under global +# See: osmo-helm-charts/backend-operator/values.yaml +cat > /tmp/backend_operator_values.yaml </dev/null || true + +echo "" +echo "========================================" +log_success "OSMO Backend Operator deployment complete!" +echo "========================================" +echo "" +echo "Backend Name: ${BACKEND_NAME}" +echo "Agent URL (WebSocket): ${OSMO_SERVICE_URL}" +echo "" +echo "To verify the backend registration:" +echo "" +echo " Terminal 1 - Start port-forward (keep running):" +echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" +echo "" +echo " Terminal 2 - Check backend status:" +echo " osmo config show BACKEND ${BACKEND_NAME}" +echo "" +echo " Or via curl:" +echo " curl http://localhost:8080/api/configs/backend" +echo "" diff --git a/applications/osmo/deploy/002-setup/05-configure-storage.sh b/applications/osmo/deploy/002-setup/05-configure-storage.sh new file mode 100755 index 000000000..47e7a9d53 --- /dev/null +++ b/applications/osmo/deploy/002-setup/05-configure-storage.sh @@ -0,0 +1,251 @@ +#!/bin/bash +# +# Configure OSMO Storage +# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/configure_data_storage.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" + +echo "" +echo "========================================" +echo " OSMO Storage Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + 
+# ----------------------------------------------------------------------------- +# Get Storage Configuration from Terraform +# ----------------------------------------------------------------------------- +log_info "Retrieving storage configuration from Terraform..." + +S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "") + +# Default endpoint if not set +if [[ -z "$S3_ENDPOINT" ]]; then + S3_ENDPOINT="https://storage.eu-north1.nebius.cloud" +fi + +if [[ -z "$S3_BUCKET" ]]; then + log_error "Could not retrieve storage bucket name from Terraform" + echo "" + echo "Make sure you have run 'terraform apply' in deploy/001-iac" + echo "and that storage is enabled in your terraform.tfvars" + exit 1 +fi + +log_success "Storage bucket: ${S3_BUCKET}" +log_success "Storage endpoint: ${S3_ENDPOINT}" + +# ----------------------------------------------------------------------------- +# Check/Create osmo-storage secret +# ----------------------------------------------------------------------------- +log_info "Checking for osmo-storage secret..." + +if ! kubectl get secret osmo-storage -n osmo &>/dev/null; then + log_warning "osmo-storage secret not found - attempting to create from MysteryBox..." + + # Get credentials from Terraform/MysteryBox + S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_KEY="" + + if [[ -n "$S3_SECRET_REF_ID" ]]; then + log_info "Retrieving storage secret from MysteryBox..." 
+ # IAM access key secrets are stored with key "secret" in MysteryBox + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") + fi + + if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + echo "" + echo "Either re-run 03-deploy-osmo-control-plane.sh or create the secret manually:" + echo "" + echo " kubectl create secret generic osmo-storage \\" + echo " --namespace osmo \\" + echo " --from-literal=access-key-id= \\" + echo " --from-literal=secret-access-key=" + exit 1 + fi + + # Create the secret + kubectl create secret generic osmo-storage \ + --namespace osmo \ + --from-literal=access-key-id="${S3_ACCESS_KEY}" \ + --from-literal=secret-access-key="${S3_SECRET_KEY}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "osmo-storage secret created" +else + log_success "osmo-storage secret exists" +fi + +# ----------------------------------------------------------------------------- +# Start port-forward and configure storage +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +# Start port-forward in background +kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & +PORT_FORWARD_PID=$! + +# Cleanup function +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login with dev method +log_info "Logging in to OSMO..." +if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + exit 1 +fi +log_success "Logged in successfully" + +# ----------------------------------------------------------------------------- +# Get Storage Credentials +# ----------------------------------------------------------------------------- +log_info "Retrieving storage credentials..." + +# Get access key from Terraform +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + +# Get secret key from osmo-storage secret (already created) +S3_SECRET_KEY=$(kubectl get secret osmo-storage -n osmo -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + exit 1 +fi + +# Nebius Object Storage uses S3-compatible API +# OSMO uses TOS (Torch Object Storage) scheme for S3-compatible storage with custom endpoints +# Format: tos:/// +S3_HOST=$(echo "$S3_ENDPOINT" | sed 's|https://||') +BACKEND_URI="tos://${S3_HOST}/${S3_BUCKET}" +REGION="eu-north1" + +log_success "Storage credentials retrieved" + +# ----------------------------------------------------------------------------- +# Configure Workflow Log Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow log storage..." 
+ +# Create workflow log config JSON +WORKFLOW_LOG_CONFIG=$(cat < /tmp/workflow_log_config.json + +# Use EDITOR='tee' trick to bypass interactive editor +if echo 'Configure workflow log storage' | EDITOR='tee' osmo config update WORKFLOW --file /tmp/workflow_log_config.json 2>/dev/null; then + log_success "Workflow log storage configured" +else + log_error "Failed to configure workflow log storage" + rm -f /tmp/workflow_log_config.json + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Configure Workflow Data Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow data storage..." + +# Create workflow data config JSON +WORKFLOW_DATA_CONFIG=$(cat < /tmp/workflow_data_config.json + +# Use EDITOR='tee' trick to bypass interactive editor +if echo 'Configure workflow data storage' | EDITOR='tee' osmo config update WORKFLOW --file /tmp/workflow_data_config.json 2>/dev/null; then + log_success "Workflow data storage configured" +else + log_error "Failed to configure workflow data storage" + rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + exit 1 +fi + +# Cleanup temp files +rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying storage configuration..." + +echo "" +echo "Workflow configuration:" +osmo config show WORKFLOW 2>/dev/null || \ + curl -s "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' || \ + log_warning "Could not retrieve workflow config for verification" + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Storage configuration complete!" 
+echo "========================================" +echo "" +echo "Storage Details:" +echo " Bucket: ${S3_BUCKET}" +echo " Endpoint: ${S3_ENDPOINT}" +echo " Backend URI: ${BACKEND_URI}" +echo " Region: ${REGION}" +echo "" +echo "Configured:" +echo " - workflow_log: For storing workflow logs" +echo " - workflow_data: For storing intermediate task data" +echo "" +echo "OSMO can now store workflow artifacts in Nebius Object Storage." +echo "" diff --git a/applications/osmo/deploy/002-setup/06-configure-service-url.sh b/applications/osmo/deploy/002-setup/06-configure-service-url.sh new file mode 100755 index 000000000..76c4ee481 --- /dev/null +++ b/applications/osmo/deploy/002-setup/06-configure-service-url.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# +# Configure OSMO Service URL +# Required for osmo-ctrl sidecar to communicate with OSMO service +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" + +echo "" +echo "========================================" +echo " OSMO Service URL Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & +PORT_FORWARD_PID=$! + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login +log_info "Logging in to OSMO..." +if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + exit 1 +fi +log_success "Logged in successfully" + +# ----------------------------------------------------------------------------- +# Check current service_base_url +# ----------------------------------------------------------------------------- +log_info "Checking current service_base_url..." + +CURRENT_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') +echo "Current service_base_url: '${CURRENT_URL}'" + +if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then + log_success "service_base_url is already configured: ${CURRENT_URL}" + echo "" + echo "To reconfigure, delete the current value first or update manually." + cleanup_port_forward + trap - EXIT + exit 0 +fi + +# ----------------------------------------------------------------------------- +# Configure service_base_url +# ----------------------------------------------------------------------------- +log_info "Configuring service_base_url..." 
+ +# The osmo-ctrl sidecar needs to connect to the OSMO service via the proxy +SERVICE_URL="http://osmo-proxy.osmo.svc.cluster.local:80" + +cat > /tmp/service_url_fix.json << EOF +{ + "service_base_url": "${SERVICE_URL}" +} +EOF + +if echo 'Configure service URL' | EDITOR='tee' osmo config update SERVICE --file /tmp/service_url_fix.json 2>/dev/null; then + log_success "service_base_url configured" +else + log_error "Failed to configure service_base_url" + rm -f /tmp/service_url_fix.json + exit 1 +fi + +rm -f /tmp/service_url_fix.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." + +NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + +if [[ "$NEW_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url verified: ${NEW_URL}" +else + log_error "Verification failed. Expected: ${SERVICE_URL}, Got: ${NEW_URL}" + exit 1 +fi + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Service URL configuration complete!" 
+echo "========================================" +echo "" +echo "Service URL: ${SERVICE_URL}" +echo "" +echo "This URL is used by the osmo-ctrl sidecar container to:" +echo " - Stream workflow logs to the OSMO service" +echo " - Report task status and completion" +echo " - Fetch authentication tokens" +echo "" diff --git a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh new file mode 100755 index 000000000..aa371b74f --- /dev/null +++ b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Configure OSMO GPU platform with tolerations via pod templates +# Based on OSMO documentation: https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/resource_pools.html + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" + +echo "" +echo "========================================" +echo " OSMO GPU Platform Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +kubectl port-forward -n "${OSMO_NAMESPACE}" svc/osmo-service 8080:80 &>/dev/null & +PORT_FORWARD_PID=$! + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# ----------------------------------------------------------------------------- +# Step 1: Create GPU pod template +# ----------------------------------------------------------------------------- +log_info "Creating gpu_tolerations pod template..." + +RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT \ + "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ + -H "Content-Type: application/json" \ + -d @"${SCRIPT_DIR}/gpu_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Pod template created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 2: Create GPU platform +# ----------------------------------------------------------------------------- +log_info "Creating gpu platform in default pool..." 
+ +RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT \ + "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ + -H "Content-Type: application/json" \ + -d @"${SCRIPT_DIR}/gpu_platform_update.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "GPU platform created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create GPU platform (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 3: Verify configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." + +echo "" +echo "Pod templates:" +curl -s "${OSMO_URL}/api/configs/pod_template" | jq 'keys' + +echo "" +echo "GPU platform config:" +curl -s "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' + +# ----------------------------------------------------------------------------- +# Step 4: Check GPU resources +# ----------------------------------------------------------------------------- +log_info "Checking GPU resources..." 
+sleep 3 # Wait for backend to pick up changes + +RESOURCE_COUNT=$(curl -s "${OSMO_URL}/api/resources" | jq '[.resources[] | select(.allocatable_fields.gpu != null)] | length') +echo "GPU nodes visible to OSMO: ${RESOURCE_COUNT}" + +if [[ "$RESOURCE_COUNT" -gt 0 ]]; then + echo "" + echo "GPU resources:" + curl -s "${OSMO_URL}/api/resources" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' +fi + +# ----------------------------------------------------------------------------- +# Done +# ----------------------------------------------------------------------------- +log_success "GPU platform configuration complete" +echo "" +echo "To submit a GPU workflow:" +echo " osmo workflow submit workflows/osmo/gpu_test.yaml" +echo "" +echo "Or test via curl:" +echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" diff --git a/applications/osmo/deploy/002-setup/README.md b/applications/osmo/deploy/002-setup/README.md new file mode 100755 index 000000000..fab486567 --- /dev/null +++ b/applications/osmo/deploy/002-setup/README.md @@ -0,0 +1,358 @@ +# Kubernetes Setup Scripts + +This directory contains scripts for configuring the Kubernetes cluster with GPU infrastructure and OSMO components. + +## Prerequisites + +1. Complete infrastructure deployment (001-iac) +2. kubectl configured with cluster access: + ```bash + nebius mk8s cluster get-credentials --id --external + ``` + +## Deployment Order + +Run scripts in order: + +```bash +# 1. GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) +./01-deploy-gpu-infrastructure.sh + +# 2. Observability (Prometheus, Grafana, Loki) +./02-deploy-observability.sh + +# 3. OSMO Control Plane +./03-deploy-osmo-control-plane.sh + +# 4. OSMO Backend +./04-deploy-osmo-backend.sh + +# 5. 
Configure Storage (requires port-forward, see main README) +./05-configure-storage.sh + +# 6. Configure Service URL (required for workflows) +./06-configure-service-url.sh + +# 7. Configure GPU Platform (required for GPU workflows) +./07-configure-gpu-platform.sh +``` + +## Scripts + +| Script | Purpose | Duration | +|--------|---------|----------| +| `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | +| `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | +| `03-deploy-osmo-control-plane.sh` | OSMO Control Plane, nginx proxy, database secrets | ~5 min | +| `04-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | +| `05-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `06-configure-service-url.sh` | Configure service URL for osmo-ctrl sidecar | ~1 min | +| `07-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | + +## Configuration + +### Helm Values + +Customize deployments by editing files in `values/`: + +| File | Component | +|------|-----------| +| `gpu-operator.yaml` | NVIDIA GPU Operator | +| `network-operator.yaml` | NVIDIA Network Operator | +| `kai-scheduler.yaml` | KAI GPU Scheduler | +| `prometheus.yaml` | Prometheus + Grafana | +| `loki.yaml` | Loki Log Aggregation | +| `promtail.yaml` | Log Collection | + +### Environment Variables + +Configure via `defaults.sh`: + +```bash +# Namespaces +GPU_OPERATOR_NAMESPACE="gpu-operator" +NETWORK_OPERATOR_NAMESPACE="network-operator" +MONITORING_NAMESPACE="monitoring" +OSMO_NAMESPACE="osmo" + +# Grafana password (auto-generated if empty) +GRAFANA_ADMIN_PASSWORD="" +``` + +### Secrets from MysteryBox + +If you ran `secrets-init.sh` in the prerequisites step, the following environment variables are set: + +| Variable | Description | +|----------|-------------| +| `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | +| 
`TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | + +The `03-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. + +**Secret retrieval order:** +1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) +2. **Terraform outputs** (fallback) +3. **Environment variables** (fallback) +4. **Interactive prompt** (last resort) + +To manually retrieve secrets from MysteryBox: +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_postgresql_mysterybox_secret_id \ + --key password --format json | jq -r '.data.string_value' + +# MEK (Master Encryption Key) +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_mek_mysterybox_secret_id \ + --key mek --format json | jq -r '.data.string_value' +``` + +## Accessing Services + +### Grafana Dashboard + +```bash +kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 +# Open http://localhost:3000 +# User: admin +# Password: (shown during deployment or in defaults.sh) +``` + +### Prometheus + +```bash +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 +# Open http://localhost:9090 +``` + +### OSMO API + +```bash +kubectl port-forward -n osmo svc/osmo-service 8080:80 +# Open http://localhost:8080 +``` + +### OSMO Web UI + +```bash +kubectl port-forward -n osmo svc/osmo-ui 8081:80 +# Open http://localhost:8081 +``` + +## Cleanup + +Run cleanup scripts in reverse order: + +```bash +cd cleanup + +# Remove OSMO +./uninstall-osmo-backend.sh +./uninstall-osmo-control-plane.sh + +# Remove observability +./uninstall-observability.sh + +# Remove GPU infrastructure +./uninstall-gpu-infrastructure.sh +``` + +## Configure OSMO GPU Platform + +After deploying OSMO backend, configure the GPU platform so OSMO can schedule workloads on GPU nodes. 
+ +### Why is this needed? + +Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pods from being scheduled unless they have matching tolerations. OSMO needs to be configured with: + +1. A **pod template** with GPU tolerations and node selector +2. A **GPU platform** that references this pod template + +### Option 1: Run the Configuration Script (Recommended) + +```bash +./07-configure-gpu-platform.sh +``` + +### Option 2: Manual Configuration via API + +With port-forward running (`kubectl port-forward -n osmo svc/osmo-service 8080:80`): + +**Step 1: Create GPU Pod Template** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pod_template/gpu_tolerations' \ + -H 'Content-Type: application/json' \ + -d @gpu_pod_template.json +``` + +Where `gpu_pod_template.json` contains: + +```json +{ + "configs": { + "spec": { + "tolerations": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ], + "nodeSelector": { + "nvidia.com/gpu.present": "true" + } + } + } +} +``` + +**Step 2: Create GPU Platform** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pool/default/platform/gpu' \ + -H 'Content-Type: application/json' \ + -d @gpu_platform_update.json +``` + +Where `gpu_platform_update.json` contains: + +```json +{ + "configs": { + "description": "GPU platform for L40S nodes", + "host_network_allowed": false, + "privileged_allowed": false, + "allowed_mounts": [], + "default_mounts": [], + "default_variables": { + "USER_GPU": 1 + }, + "resource_validations": [], + "override_pod_template": ["gpu_tolerations"] + } +} +``` + +### Verify Configuration + +```bash +# Check pod templates +curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' +# Should include: "gpu_tolerations" + +# Check GPU platform +curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms.gpu' + +# Check resources (GPU nodes should now be visible) +curl -s http://localhost:8080/api/resources | jq '.resources[] | 
{name: .name, gpu: .allocatable_fields.gpu}'
```

### Using GPU in Workflows

Specify `platform: gpu` in your OSMO workflow:

```yaml
workflow:
  name: my-gpu-job
  resources:
    gpu-resource:
      platform: gpu  # <-- Selects GPU platform with tolerations
      gpu: 1
      memory: 4Gi
  tasks:
    - name: train
      image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04
      command: ["nvidia-smi"]
      resource: gpu-resource
```

## Troubleshooting

### GPU Nodes Not Ready

1. Check GPU operator pods:
   ```bash
   kubectl get pods -n gpu-operator
   ```

2. Check node labels:
   ```bash
   kubectl get nodes -l node-type=gpu --show-labels
   ```

3. Check DCGM exporter:
   ```bash
   kubectl logs -n gpu-operator -l app=nvidia-dcgm-exporter
   ```

### Pods Pending on GPU Nodes

1. Verify tolerations:
   ```bash
   kubectl describe pod <pod-name> | grep -A5 Tolerations
   ```

2. Check node taints:
   ```bash
   kubectl describe node <node-name> | grep Taints
   ```

### InfiniBand Issues

1. Check Network Operator:
   ```bash
   kubectl get pods -n network-operator
   ```

2. Verify RDMA devices:
   ```bash
   kubectl exec -n gpu-operator <pod-name> -- ibstat
   ```

### Database Connection Failed

1. Verify PostgreSQL is accessible:
   ```bash
   kubectl get secret osmo-database -n osmo -o yaml
   ```

2. Test connection from a pod:
   ```bash
   kubectl run pg-test --rm -it --image=postgres:16 -- psql -h <host> -U <user> -d <database>
   ```

### OSMO Not Seeing GPU Resources

If OSMO shows 0 GPUs or GPU workflows fail to schedule:

1. Check if GPU platform is configured:
   ```bash
   curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms | keys'
   # Should include "gpu"
   ```

2. Check if GPU pod template exists:
   ```bash
   curl -s http://localhost:8080/api/configs/pod_template | jq 'keys'
   # Should include "gpu_tolerations"
   ```

3.
Check GPU node labels and taints: + ```bash + kubectl describe node | grep -E 'Taints:|nvidia.com/gpu' + # Should show taint: nvidia.com/gpu=true:NoSchedule + # Should show label: nvidia.com/gpu.present=true + ``` + +4. If missing, run the GPU configuration: + ```bash + ./07-configure-gpu-platform.sh + ``` + +5. Verify OSMO sees GPU resources: + ```bash + curl -s http://localhost:8080/api/resources | jq '.resources[] | select(.allocatable_fields.gpu != null)' + ``` diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh new file mode 100755 index 000000000..446217ada --- /dev/null +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Uninstall GPU Infrastructure +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling GPU Infrastructure" +echo "========================================" +echo "" + +log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" +read -p "Continue? (y/N): " confirm +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing KAI Scheduler..." +helm uninstall kai-scheduler -n "${KAI_SCHEDULER_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${KAI_SCHEDULER_NAMESPACE}" --ignore-not-found + +log_info "Removing Network Operator..." +helm uninstall network-operator -n "${NETWORK_OPERATOR_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${NETWORK_OPERATOR_NAMESPACE}" --ignore-not-found + +log_info "Removing GPU Operator..." +helm uninstall gpu-operator -n "${GPU_OPERATOR_NAMESPACE}" 2>/dev/null || true + +# Remove GPU Operator CRDs +log_info "Removing GPU Operator CRDs..." 
# CRDs are cluster-scoped and are not removed by "helm uninstall"; delete the
# GPU Operator CRDs explicitly.
kubectl delete crd clusterpolicies.nvidia.com --ignore-not-found
kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found

kubectl delete namespace "${GPU_OPERATOR_NAMESPACE}" --ignore-not-found

log_success "GPU infrastructure uninstalled"
diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh
new file mode 100755
index 000000000..4ba162619
--- /dev/null
+++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh
@@ -0,0 +1,48 @@
#!/bin/bash
#
# Uninstall Observability Stack
#

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/../lib/common.sh"
source "${SCRIPT_DIR}/../defaults.sh"

echo ""
echo "========================================"
echo " Uninstalling Observability Stack"
echo "========================================"
echo ""

log_warning "This will remove Prometheus, Grafana, and Loki"
# -r: read raw input so backslashes in the reply are not interpreted (SC2162)
read -r -p "Continue? (y/N): " confirm
if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    log_info "Cancelled"
    exit 0
fi

# "|| true": best-effort removal — the release may already be gone
log_info "Removing Promtail..."
helm uninstall promtail -n "${MONITORING_NAMESPACE}" 2>/dev/null || true

log_info "Removing Loki..."
helm uninstall loki -n "${MONITORING_NAMESPACE}" 2>/dev/null || true

log_info "Removing Prometheus stack..."
helm uninstall prometheus -n "${MONITORING_NAMESPACE}" 2>/dev/null || true

# Remove CRDs
log_info "Removing Prometheus CRDs..."
# Prometheus-operator CRDs are cluster-scoped; remove them explicitly.
kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found
kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found
kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd probes.monitoring.coreos.com --ignore-not-found
kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found
kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found
kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found
kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found

log_info "Removing monitoring namespace..."
kubectl delete namespace "${MONITORING_NAMESPACE}" --ignore-not-found

log_success "Observability stack uninstalled"
diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh
new file mode 100755
index 000000000..9967324f8
--- /dev/null
+++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Uninstall OSMO Backend
#

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/../lib/common.sh"
source "${SCRIPT_DIR}/../defaults.sh"

echo ""
echo "========================================"
echo " Uninstalling OSMO Backend"
echo "========================================"
echo ""

log_warning "This will remove OSMO Backend services"
# -r: read raw input so backslashes in the reply are not interpreted (SC2162)
read -r -p "Continue? (y/N): " confirm
if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    log_info "Cancelled"
    exit 0
fi

log_info "Removing OSMO Backend..."
kubectl delete deployment osmo-backend -n "${OSMO_NAMESPACE}" --ignore-not-found
kubectl delete service osmo-backend -n "${OSMO_NAMESPACE}" --ignore-not-found
kubectl delete service osmo-api -n "${OSMO_NAMESPACE}" --ignore-not-found

log_success "OSMO Backend uninstalled"
diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh
new file mode 100755
index 000000000..63e44420b
--- /dev/null
+++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Uninstall OSMO Control Plane
#

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/../lib/common.sh"
source "${SCRIPT_DIR}/../defaults.sh"

echo ""
echo "========================================"
echo " Uninstalling OSMO Control Plane"
echo "========================================"
echo ""

log_warning "This will remove OSMO Control Plane and all OSMO resources"
# -r: read raw input so backslashes in the reply are not interpreted (SC2162)
read -r -p "Continue? (y/N): " confirm
if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
    log_info "Cancelled"
    exit 0
fi

log_info "Removing OSMO Control Plane..."
kubectl delete deployment osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found
kubectl delete service osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found
kubectl delete secret osmo-database -n "${OSMO_NAMESPACE}" --ignore-not-found
kubectl delete secret osmo-storage -n "${OSMO_NAMESPACE}" --ignore-not-found

log_info "Removing OSMO namespace..."
kubectl delete namespace "${OSMO_NAMESPACE}" --ignore-not-found

log_success "OSMO Control Plane uninstalled"
diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh
new file mode 100755
index 000000000..14aae0071
--- /dev/null
+++ b/applications/osmo/deploy/002-setup/defaults.sh
@@ -0,0 +1,37 @@
# =============================================================================
# Default Configuration for Setup Scripts
# =============================================================================
# Sourced (not executed) by the setup and cleanup scripts after lib/common.sh.

# Namespaces
export GPU_OPERATOR_NAMESPACE="gpu-operator"
export NETWORK_OPERATOR_NAMESPACE="network-operator"
export KAI_SCHEDULER_NAMESPACE="kai-scheduler"
export MONITORING_NAMESPACE="monitoring"
export OSMO_NAMESPACE="osmo"

# Chart versions (leave empty for latest)
export GPU_OPERATOR_VERSION=""
export NETWORK_OPERATOR_VERSION=""
export KAI_SCHEDULER_VERSION="v0.12.4"  # Check https://github.com/NVIDIA/KAI-Scheduler/releases
export PROMETHEUS_VERSION=""
export GRAFANA_VERSION=""
export LOKI_VERSION=""

# GPU Operator settings
export GPU_DRIVER_ENABLED="false"  # Use Nebius driver-full images
export TOOLKIT_ENABLED="true"
export DEVICE_PLUGIN_ENABLED="true"
export MIG_MANAGER_ENABLED="false"

# Network Operator (only needed for InfiniBand/GPU clusters)
export ENABLE_NETWORK_OPERATOR="false"  # Set to "true" if using InfiniBand

# Observability settings
export PROMETHEUS_RETENTION_DAYS="15"
export LOKI_RETENTION_DAYS="7"
export GRAFANA_ADMIN_PASSWORD=""  # Auto-generated if empty

# Paths
# NOTE(review): BASH_SOURCE[0] resolves to this file, so sourcing defaults.sh
# OVERWRITES any SCRIPT_DIR the calling script set (e.g. cleanup/*.sh set their
# own SCRIPT_DIR first). This makes VALUES_DIR/LIB_DIR below always point at
# 002-setup — confirm callers do not rely on their own SCRIPT_DIR afterwards.
export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export VALUES_DIR="${SCRIPT_DIR}/values"
export LIB_DIR="${SCRIPT_DIR}/lib"
diff --git a/applications/osmo/deploy/002-setup/gpu_platform_update.json b/applications/osmo/deploy/002-setup/gpu_platform_update.json
new file mode 100755
index 000000000..56c0764fe
--- /dev/null
+++
b/applications/osmo/deploy/002-setup/gpu_platform_update.json @@ -0,0 +1,14 @@ +{ + "configs": { + "description": "GPU platform for L40S nodes", + "host_network_allowed": false, + "privileged_allowed": false, + "allowed_mounts": [], + "default_mounts": [], + "default_variables": { + "USER_GPU": 1 + }, + "resource_validations": [], + "override_pod_template": ["gpu_tolerations"] + } +} diff --git a/applications/osmo/deploy/002-setup/gpu_pod_template.json b/applications/osmo/deploy/002-setup/gpu_pod_template.json new file mode 100755 index 000000000..ae651e3ba --- /dev/null +++ b/applications/osmo/deploy/002-setup/gpu_pod_template.json @@ -0,0 +1,16 @@ +{ + "configs": { + "spec": { + "tolerations": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ], + "nodeSelector": { + "nvidia.com/gpu.present": "true" + } + } + } +} diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh new file mode 100755 index 000000000..bc90c60c1 --- /dev/null +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# +# Common functions for setup scripts +# + +# Colors +export RED='\033[0;31m' +export GREEN='\033[0;32m' +export YELLOW='\033[1;33m' +export BLUE='\033[0;34m' +export NC='\033[0m' + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[✓]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[!]${NC} $1" +} + +log_error() { + echo -e "${RED}[✗]${NC} $1" +} + +# Check if command exists +check_command() { + command -v "$1" &>/dev/null +} + +# Retry with exponential backoff +retry_with_backoff() { + local max_attempts=${1:-5} + local delay=${2:-2} + local max_delay=${3:-60} + shift 3 + local cmd=("$@") + + local attempt=1 + while [[ $attempt -le $max_attempts ]]; do + log_info "Attempt $attempt/$max_attempts: ${cmd[*]}" + if "${cmd[@]}"; then + return 0 + fi + + if [[ $attempt -lt $max_attempts ]]; then + 
log_warning "Failed, retrying in ${delay}s..." + sleep "$delay" + delay=$((delay * 2)) + if [[ $delay -gt $max_delay ]]; then + delay=$max_delay + fi + fi + ((attempt++)) + done + + log_error "All $max_attempts attempts failed" + return 1 +} + +# Wait for a condition with timeout +wait_for_condition() { + local description=$1 + local timeout=${2:-300} + local interval=${3:-10} + shift 3 + local cmd=("$@") + + log_info "Waiting for $description (timeout: ${timeout}s)..." + + local elapsed=0 + while [[ $elapsed -lt $timeout ]]; do + if "${cmd[@]}" &>/dev/null; then + log_success "$description" + return 0 + fi + sleep "$interval" + ((elapsed += interval)) + echo -n "." + done + + echo "" + log_error "Timeout waiting for $description" + return 1 +} + +# Check kubectl connection +check_kubectl() { + if ! check_command kubectl; then + log_error "kubectl not found" + return 1 + fi + + if ! kubectl cluster-info &>/dev/null; then + log_error "Cannot connect to Kubernetes cluster" + return 1 + fi + + log_success "kubectl connected to cluster" + return 0 +} + +# Check Helm +check_helm() { + if ! 
check_command helm; then + log_error "helm not found" + return 1 + fi + + log_success "helm available" + return 0 +} + +# Install Helm chart with retry +helm_install() { + local name=$1 + local chart=$2 + local namespace=$3 + shift 3 + local extra_args=("$@") + + log_info "Installing Helm chart: $name" + + kubectl create namespace "$namespace" --dry-run=client -o yaml | kubectl apply -f - + + retry_with_backoff 3 5 30 helm upgrade --install "$name" "$chart" \ + --namespace "$namespace" \ + --wait --timeout 10m \ + "${extra_args[@]}" +} + +# Wait for pods to be ready +wait_for_pods() { + local namespace=$1 + local label_selector=$2 + local timeout=${3:-300} + + wait_for_condition "pods with label $label_selector in $namespace" \ + "$timeout" 10 \ + kubectl wait --for=condition=Ready pods \ + -n "$namespace" \ + -l "$label_selector" \ + --timeout=10s +} + +# Get Terraform output (supports nested values like "postgresql.host") +get_tf_output() { + local name=$1 + local tf_dir=${2:-../001-iac} + + # Check if name contains a dot (nested value) + if [[ "$name" == *.* ]]; then + local base_name="${name%%.*}" + local key="${name#*.}" + terraform -chdir="$tf_dir" output -json "$base_name" 2>/dev/null | jq -r ".$key // empty" + else + terraform -chdir="$tf_dir" output -json "$name" 2>/dev/null | jq -r '. 
// empty' + fi +} + +# Get Nebius CLI path +get_nebius_path() { + if command -v nebius &>/dev/null; then + command -v nebius + elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then + echo "$HOME/.nebius/bin/nebius" + fi +} + +# Read secret from Nebius MysteryBox +# Usage: get_mysterybox_secret +# Returns the secret value or empty string if not found +get_mysterybox_secret() { + local secret_id=$1 + local key=$2 + local nebius_path=$(get_nebius_path) + + if [[ -z "$nebius_path" ]]; then + log_warning "Nebius CLI not found, cannot read from MysteryBox" + return 1 + fi + + if [[ -z "$secret_id" ]]; then + return 1 + fi + + local result=$("$nebius_path" mysterybox v1 payload get-by-key \ + --secret-id "$secret_id" \ + --key "$key" \ + --format json 2>/dev/null) + + if [[ -n "$result" ]]; then + echo "$result" | jq -r '.data.string_value // empty' 2>/dev/null + fi +} diff --git a/applications/osmo/deploy/002-setup/nginx-proxy.yaml b/applications/osmo/deploy/002-setup/nginx-proxy.yaml new file mode 100755 index 000000000..b8eab7837 --- /dev/null +++ b/applications/osmo/deploy/002-setup/nginx-proxy.yaml @@ -0,0 +1,120 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: osmo-proxy-nginx-config + namespace: osmo +data: + nginx.conf: | + events { + worker_connections 1024; + } + + http { + # Logging + access_log /dev/stdout; + error_log /dev/stderr; + + # Conditional WebSocket support + # Sets Connection header to "upgrade" for WebSocket requests, "close" otherwise + # This is important for proper handling of both WebSocket and regular HTTP requests + map $http_upgrade $connection_upgrade { + default upgrade; + '' close; + } + + # Upstream servers + upstream osmo-service { + server osmo-service.osmo.svc.cluster.local:80; + } + + upstream osmo-logger { + server osmo-logger.osmo.svc.cluster.local:80; + } + + upstream osmo-agent { + server osmo-agent.osmo.svc.cluster.local:80; + } + + server { + listen 80; + + # Common proxy headers + proxy_http_version 1.1; + proxy_set_header Host 
$host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # WebSocket support (conditional based on Upgrade header) + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + + # Timeouts for long-running WebSocket connections (osmo-ctrl logging) + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + + # Route /api/logger/* to osmo-logger (WebSocket for log streaming) + location /api/logger/ { + proxy_pass http://osmo-logger; + } + + # Route /api/agent/* to osmo-agent (WebSocket for backend communication) + location /api/agent/ { + proxy_pass http://osmo-agent; + } + + # Everything else to osmo-service (REST API) + location / { + proxy_pass http://osmo-service; + } + } + } +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: osmo-proxy + namespace: osmo +spec: + replicas: 1 + selector: + matchLabels: + app: osmo-proxy + template: + metadata: + labels: + app: osmo-proxy + spec: + containers: + - name: nginx + image: nginx:alpine + ports: + - containerPort: 80 + volumeMounts: + - name: nginx-config + mountPath: /etc/nginx/nginx.conf + subPath: nginx.conf + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: nginx-config + configMap: + name: osmo-proxy-nginx-config +--- +apiVersion: v1 +kind: Service +metadata: + name: osmo-proxy + namespace: osmo +spec: + selector: + app: osmo-proxy + ports: + - port: 80 + targetPort: 80 + type: ClusterIP diff --git a/applications/osmo/deploy/002-setup/osmo-values-noauth.yaml b/applications/osmo/deploy/002-setup/osmo-values-noauth.yaml new file mode 100755 index 000000000..53eb46662 --- /dev/null +++ b/applications/osmo/deploy/002-setup/osmo-values-noauth.yaml @@ -0,0 +1,170 @@ +# OSMO Service values - Auth Disabled +# For testing without authentication + +global: + osmoImageLocation: nvcr.io/nvidia/osmo + osmoImageTag: latest 
+ imagePullPolicy: IfNotPresent + +services: + postgres: + enabled: false + serviceName: postgresql.osmo.svc.cluster.local + port: 5432 + db: osmo + user: osmo_admin + passwordSecretName: postgres-secret + passwordSecretKey: password + + redis: + enabled: false + serviceName: redis-master.osmo.svc.cluster.local + port: 6379 + tlsEnabled: false + + service: + scaling: + minReplicas: 1 + maxReplicas: 1 + ingress: + enabled: false + auth: + enabled: false + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + worker: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + logger: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + 
extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + agent: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + delayedJobMonitor: + replicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + +sidecars: + envoy: + enabled: false + rateLimit: + enabled: false + logAgent: + enabled: false + otel: + enabled: false diff --git a/applications/osmo/deploy/002-setup/values/gpu-operator.yaml b/applications/osmo/deploy/002-setup/values/gpu-operator.yaml new file mode 100755 index 000000000..11cc02fdf --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/gpu-operator.yaml @@ -0,0 +1,57 @@ +# GPU Operator Helm Values +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator +# https://docs.nebius.com/kubernetes/gpu/set-up + +operator: + defaultRuntime: containerd + +# Enable driver installation by GPU Operator +# Even though Nebius nodes may have pre-installed drivers, the GPU Operator +# needs to manage the driver lifecycle for proper integration with 
device-plugin, +# toolkit, and other components. +driver: + enabled: true + # Let GPU Operator choose the appropriate driver version + # version: auto-detected by operator + upgradePolicy: + autoUpgrade: false # Don't auto-upgrade to avoid conflicts + +toolkit: + enabled: true + +devicePlugin: + enabled: true + config: + default: "any" + +dcgm: + enabled: true + +dcgmExporter: + enabled: true + serviceMonitor: + enabled: true + +gfd: + enabled: true + +migManager: + enabled: false + +nodeStatusExporter: + enabled: true + +# Node selector for GPU operator pods +node-feature-discovery: + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Tolerations for GPU workloads +daemonsets: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/applications/osmo/deploy/002-setup/values/grafana.yaml b/applications/osmo/deploy/002-setup/values/grafana.yaml new file mode 100755 index 000000000..ab8dd6b6b --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/grafana.yaml @@ -0,0 +1,70 @@ +# Grafana Helm Values (standalone) +# https://github.com/grafana/helm-charts/tree/main/charts/grafana + +# Note: Grafana is typically deployed as part of kube-prometheus-stack +# This file is for standalone Grafana deployment if needed + +replicas: 1 + +adminUser: admin +# adminPassword should be set via --set or secret + +persistence: + enabled: true + size: 10Gi + storageClassName: "" + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# Datasources +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-kube-prometheus-prometheus:9090 + access: proxy + isDefault: true + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + +# Dashboard providers +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + 
disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + +# Sidecar for dashboards +sidecar: + dashboards: + enabled: true + label: grafana_dashboard + datasources: + enabled: true + label: grafana_datasource + +# Service +service: + type: ClusterIP + port: 80 + +# Ingress (disabled by default) +ingress: + enabled: false diff --git a/applications/osmo/deploy/002-setup/values/kai-scheduler.yaml b/applications/osmo/deploy/002-setup/values/kai-scheduler.yaml new file mode 100755 index 000000000..320c867db --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/kai-scheduler.yaml @@ -0,0 +1,13 @@ +# KAI Scheduler Helm Values +# GPU-aware scheduler for OSMO +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html + +global: + # Modify the node selectors and tolerations to match your cluster + nodeSelector: {} + tolerations: [] + +scheduler: + additionalArgs: + - --default-staleness-grace-period=-1s # Disable staleness eviction + - --update-pod-eviction-condition=true # Enable OSMO to read preemption conditions diff --git a/applications/osmo/deploy/002-setup/values/loki.yaml b/applications/osmo/deploy/002-setup/values/loki.yaml new file mode 100755 index 000000000..f4c277a22 --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/loki.yaml @@ -0,0 +1,68 @@ +# Loki Stack Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/loki-stack + +loki: + enabled: true + + persistence: + enabled: true + size: 50Gi + + config: + auth_enabled: false + + server: + http_listen_port: 3100 + + ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + chunk_idle_period: 15m + chunk_retain_period: 30s + + schema_config: + configs: + - from: 2020-01-01 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + + storage_config: + boltdb_shipper: + active_index_directory: /data/loki/boltdb-shipper-active 
+ cache_location: /data/loki/boltdb-shipper-cache + shared_store: filesystem + filesystem: + directory: /data/loki/chunks + + limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h + max_entries_limit_per_query: 5000 + + table_manager: + retention_deletes_enabled: true + retention_period: 168h # 7 days + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + +# Promtail is deployed separately +promtail: + enabled: false + +# Grafana is deployed via kube-prometheus-stack +grafana: + enabled: false diff --git a/applications/osmo/deploy/002-setup/values/network-operator.yaml b/applications/osmo/deploy/002-setup/values/network-operator.yaml new file mode 100755 index 000000000..146a9daca --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/network-operator.yaml @@ -0,0 +1,62 @@ +# Network Operator Helm Values +# https://docs.nvidia.com/networking/display/cokan10/network+operator + +# Operator settings +operator: + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + +# RDMA shared device plugin (for InfiniBand) +rdmaSharedDevicePlugin: + deploy: true + resources: + - name: rdma_shared_device_a + vendors: [15b3] + deviceIDs: [101b, 101d, 1017, 1019] + ifNames: ["*"] + +# SR-IOV device plugin +sriovDevicePlugin: + deploy: false + +# NIC cluster policy +nicClusterPolicy: + deploy: true + + # RDMA + rdmaSharedDevicePlugin: + image: k8s-rdma-shared-dev-plugin + repository: ghcr.io/mellanox + version: sha-4f3eb55 + +# Secondary network +secondaryNetwork: + deploy: true + + # Multus CNI + multus: + deploy: true + image: multus-cni + repository: ghcr.io/k8snetworkplumbingwg + version: v3.9.3 + + # CNI plugins + cniPlugins: + deploy: true + image: plugins + repository: ghcr.io/k8snetworkplumbingwg 
+ version: v1.3.0 + + # IPAM plugin + ipamPlugin: + deploy: true + image: whereabouts + repository: ghcr.io/k8snetworkplumbingwg + version: v0.6.2 diff --git a/applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml b/applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml new file mode 100755 index 000000000..b4781ae21 --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml @@ -0,0 +1,37 @@ +# OSMO Backend Operator Values +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html + +global: + # REQUIRED: OSMO image tag (e.g., 6.0.0) + osmoImageTag: "6.0.0" + + # REQUIRED: Your OSMO service URL + serviceUrl: "https://osmo.example.com" + + # Namespaces + agentNamespace: "osmo-operator" + backendNamespace: "osmo-workflows" + + # REQUIRED: Unique name for this backend + backendName: "nebius-backend" + + # Authentication + accountTokenSecret: "osmo-operator-token" + loginMethod: "token" + + # Resource configuration + services: + backendListener: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" + backendWorker: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" diff --git a/applications/osmo/deploy/002-setup/values/prometheus.yaml b/applications/osmo/deploy/002-setup/values/prometheus.yaml new file mode 100755 index 000000000..c97a75692 --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/prometheus.yaml @@ -0,0 +1,107 @@ +# Prometheus Stack Helm Values +# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +# Prometheus +prometheus: + prometheusSpec: + retention: 15d + + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 8Gi + + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + + # Service monitors + serviceMonitorSelectorNilUsesHelmValues: false + 
podMonitorSelectorNilUsesHelmValues: false + +# Grafana +grafana: + enabled: true + + adminUser: admin + # adminPassword is set via --set flag + + persistence: + enabled: true + size: 10Gi + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # Additional datasources + additionalDataSources: + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + isDefault: false + + # Dashboards + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + # GPU dashboard + dashboards: + default: + nvidia-dcgm: + gnetId: 12239 + revision: 2 + datasource: Prometheus + +# Alertmanager +alertmanager: + enabled: true + + alertmanagerSpec: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Node exporter +nodeExporter: + enabled: true + +# Kube state metrics +kubeStateMetrics: + enabled: true + +# Prometheus operator +prometheusOperator: + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/applications/osmo/deploy/002-setup/values/promtail.yaml b/applications/osmo/deploy/002-setup/values/promtail.yaml new file mode 100755 index 000000000..601d29e57 --- /dev/null +++ b/applications/osmo/deploy/002-setup/values/promtail.yaml @@ -0,0 +1,46 @@ +# Promtail Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/promtail + +config: + clients: + - url: http://loki:3100/loki/api/v1/push + + snippets: + pipelineStages: + - cri: {} + - json: + expressions: + level: level + message: msg + - labels: + level: + - output: + source: message + +# Resources +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Tolerations to run on all nodes +tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: 
NoSchedule + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Volume mounts (for containerd logs if needed) +# Note: The default chart already mounts /var/lib/docker and /var/log +# Only add extra volumes if you need additional paths +extraVolumes: [] +extraVolumeMounts: [] + +# Service monitor +serviceMonitor: + enabled: true diff --git a/applications/osmo/deploy/README.md b/applications/osmo/deploy/README.md new file mode 100755 index 000000000..21258c450 --- /dev/null +++ b/applications/osmo/deploy/README.md @@ -0,0 +1,168 @@ +# Deployment Guide + +This directory contains all deployment artifacts for OSMO on Nebius. + +## Deployment Phases + +### Phase 0: Prerequisites (`000-prerequisites/`) + +Install required tools and configure your Nebius environment. + +```bash +cd 000-prerequisites + +# Install required tools (Terraform, kubectl, Helm, Nebius CLI) +./install-tools.sh + +# Check if tools are installed +./install-tools.sh --check + +# Configure Nebius environment +source ./nebius-env-init.sh + +# (Recommended) Initialize secrets in MysteryBox +source ./secrets-init.sh +``` + +### Phase 1: Infrastructure (`001-iac/`) + +Deploy cloud infrastructure using Terraform. 
+ +```bash +cd 001-iac + +# Recommended: Cost-optimized with secure private access +cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars + +# Other options: +# terraform.tfvars.cost-optimized.example - Cheapest (public endpoints) +# terraform.tfvars.production.example - Full production setup +# terraform.tfvars.secure.example - H100 with WireGuard + +# Edit configuration +vim terraform.tfvars + +# Deploy +terraform init +terraform plan +terraform apply +``` + +**Resources Created:** +- VPC Network and Subnet +- Managed Kubernetes Cluster +- CPU and GPU Node Groups +- Managed PostgreSQL +- Object Storage Buckets +- Filestore (Shared Filesystem) +- Container Registry +- Service Accounts +- WireGuard VPN (optional) + +### Phase 2: Kubernetes Setup (`002-setup/`) + +Configure Kubernetes with GPU infrastructure and OSMO. + +```bash +cd 002-setup + +# 1. Deploy GPU infrastructure +./01-deploy-gpu-infrastructure.sh + +# 2. Deploy observability stack +./02-deploy-observability.sh + +# 3. Deploy OSMO control plane +./03-deploy-osmo-control-plane.sh + +# 4. 
Deploy OSMO backend +./04-deploy-osmo-backend.sh +``` + +## Directory Structure + +``` +deploy/ +├── 000-prerequisites/ +│ ├── install-tools.sh # Tool installer +│ ├── nebius-env-init.sh # Environment setup +│ ├── secrets-init.sh # MysteryBox secrets setup +│ ├── wireguard-client-setup.sh # WireGuard client config +│ └── README.md +├── 001-iac/ +│ ├── modules/ +│ │ ├── platform/ # VPC, Storage, DB, Registry +│ │ ├── k8s/ # Kubernetes cluster +│ │ └── wireguard/ # VPN infrastructure +│ ├── main.tf # Root module +│ ├── variables.tf # Input variables +│ ├── outputs.tf # Output values +│ ├── versions.tf # Provider versions +│ ├── terraform.tfvars.*.example +│ └── README.md +└── 002-setup/ + ├── lib/ + │ └── common.sh # Shared functions + ├── values/ # Helm values files + ├── 01-deploy-gpu-infrastructure.sh + ├── 02-deploy-observability.sh + ├── 03-deploy-osmo-control-plane.sh + ├── 04-deploy-osmo-backend.sh + ├── cleanup/ # Uninstall scripts + └── README.md +``` + +## Configuration Files + +| File | Purpose | Recommended | +|------|---------|-------------| +| `terraform.tfvars.cost-optimized-secure.example` | Cheap + secure (L40S + VPN) | **Recommended** | +| `terraform.tfvars.cost-optimized.example` | Cheapest (L40S, public) | Dev only | +| `terraform.tfvars.production.example` | Full production (H200 + VPN) | Production | +| `terraform.tfvars.secure.example` | H100 + VPN | Staging | + +## Environment Variables + +After running `nebius-env-init.sh`, these variables are set: + +| Variable | Description | +|----------|-------------| +| `NEBIUS_TENANT_ID` | Your Nebius tenant ID | +| `NEBIUS_PROJECT_ID` | Your Nebius project ID | +| `NEBIUS_REGION` | Deployment region | +| `TF_VAR_tenant_id` | Terraform variable for tenant | +| `TF_VAR_parent_id` | Terraform variable for project | +| `TF_VAR_region` | Terraform variable for region | + +## Cleanup + +To remove all deployed resources: + +```bash +# 1. 
Remove Kubernetes components +cd 002-setup/cleanup +./uninstall-osmo-backend.sh +./uninstall-osmo-control-plane.sh +./uninstall-observability.sh +./uninstall-gpu-infrastructure.sh + +# 2. Destroy infrastructure +cd ../../001-iac +terraform destroy +``` + +## Troubleshooting + +### Terraform Errors + +1. **Authentication failed**: Run `source ../000-prerequisites/nebius-env-init.sh` +2. **Resource quota exceeded**: Check Nebius console for quota limits +3. **Invalid region**: Verify region supports required GPU types + +### Kubernetes Errors + +1. **Nodes not ready**: Check GPU operator pod logs +2. **Pods pending**: Verify node group scaling +3. **Network issues**: Check Cilium pod status + +See [Troubleshooting Guide](../docs/troubleshooting.md) for more details. diff --git a/applications/osmo/workflows/README.md b/applications/osmo/workflows/README.md new file mode 100755 index 000000000..c5a9c2c63 --- /dev/null +++ b/applications/osmo/workflows/README.md @@ -0,0 +1,156 @@ +# Workflow Templates + +OSMO workflow templates for training jobs on Nebius. + +## Available Workflows + +| File | Description | GPUs | +|------|-------------|------| +| `osmo/hello_nebius.yaml` | Hello World example with GPU | 1 | +| `osmo/gpu_test.yaml` | GPU validation test | 1 | +| `osmo/train.yaml` | Single GPU training | 1 | +| `osmo/train-multi-gpu.yaml` | Multi-GPU distributed training | 8 | + +## Quick Start + +### Test CPU Workflow + +```bash +osmo workflow submit osmo/hello_nebius.yaml +``` + +This workflow runs on a GPU node and prints "Hello Nebius!". + +### Test GPU Access + +```bash +osmo workflow submit osmo/gpu_test.yaml +``` + +This workflow validates GPU availability by running `nvidia-smi` on a Nebius L40S node. + +> **Note**: GPU workflows require the GPU platform to be configured. See [Configure OSMO GPU Platform](../deploy/002-setup/README.md#configure-osmo-gpu-platform). 
+ +## Usage + +### Submit via Script + +```bash +cd ../scripts +./submit-osmo-training.sh -w ../workflows/osmo/train.yaml +``` + +### Submit Directly + +```bash +# Single GPU +kubectl apply -f osmo/train.yaml + +# Multi-GPU +kubectl apply -f osmo/train-multi-gpu.yaml +``` + +## Workflow Structure + +### Single GPU (`train.yaml`) + +Best for: +- Development and debugging +- Small models +- Inference testing + +Resources: +- 1 GPU +- 64 GB memory +- 8 vCPUs + +### Multi-GPU (`train-multi-gpu.yaml`) + +Best for: +- Large model training +- Distributed training +- Production workloads + +Resources: +- 8 GPUs +- 1400 GB memory +- 120 vCPUs + +Features: +- InfiniBand for NCCL +- Shared memory for GPU communication +- Node affinity for GPU cluster + +## Customization + +### Change Training Image + +```yaml +containers: + - name: training + image: your-registry/your-image:tag +``` + +### Add Training Data + +```yaml +volumeMounts: + - name: shared-data + mountPath: /data +``` + +### Configure Environment + +```yaml +env: + - name: LEARNING_RATE + value: "0.001" + - name: BATCH_SIZE + value: "32" +``` + +### Add GPU Resources + +```yaml +resources: + limits: + nvidia.com/gpu: 8 +``` + +## Environment Variables + +### NCCL Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `NCCL_DEBUG` | Debug level (INFO, WARN) | INFO | +| `NCCL_IB_DISABLE` | Disable InfiniBand (0/1) | 0 | +| `NCCL_NET_GDR_LEVEL` | GPUDirect RDMA level | 5 | + +### PyTorch Distributed + +| Variable | Description | +|----------|-------------| +| `MASTER_ADDR` | Master node address | +| `MASTER_PORT` | Master node port | +| `WORLD_SIZE` | Total number of processes | +| `RANK` | Process rank | + +## Monitoring + +### View Job Status + +```bash +kubectl get jobs -n osmo +kubectl get pods -n osmo -l app=osmo-training +``` + +### View Logs + +```bash +kubectl logs -n osmo -l job-name= -f +``` + +### GPU Metrics + +Access Grafana dashboard for GPU utilization metrics. 
diff --git a/applications/osmo/workflows/osmo/gpu_test.yaml b/applications/osmo/workflows/osmo/gpu_test.yaml new file mode 100755 index 000000000..1075d4a78 --- /dev/null +++ b/applications/osmo/workflows/osmo/gpu_test.yaml @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# GPU Test Workflow for Nebius +# Validates GPU availability and CUDA functionality on Nebius L40S nodes +# +# Submit with: +# osmo workflow submit workflows/osmo/gpu_test.yaml +# +# Or via curl (with port-forward to osmo-service:8080): +# curl -X POST http://localhost:8080/api/workflow -H "Content-Type: application/yaml" --data-binary @workflows/osmo/gpu_test.yaml + +workflow: + name: gpu-test-nebius + resources: + gpu-resource: + platform: gpu + gpu: 1 + memory: 4Gi + cpu: 2 + tasks: + - name: check-gpu + image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + command: ["/bin/bash", "-c"] + args: + - | + echo "=== GPU Test on Nebius ===" + echo "" + echo "=== nvidia-smi output ===" + nvidia-smi + echo "" + echo "=== CUDA Version ===" + nvcc --version 2>/dev/null || echo "nvcc not available (base image)" + echo "" + echo "=== GPU Memory Info ===" + nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv + echo "" + echo "=== Environment ===" + echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}" + echo 
"NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-not set}" + echo "" + echo "=== GPU Test Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/hello_nebius.yaml b/applications/osmo/workflows/osmo/hello_nebius.yaml new file mode 100755 index 000000000..44a677d10 --- /dev/null +++ b/applications/osmo/workflows/osmo/hello_nebius.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +workflow: + name: hello-nebius + resources: + gpu-resource: + platform: gpu + gpu: 1 + memory: 2Gi + storage: 2Gi + tasks: + - name: hello + image: ubuntu:24.04 + command: ["echo"] + args: ["Hello Nebius!"] + resource: gpu-resource From 7760b4e90e3a9a628922dc4fbd48cc3d69976549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Mon, 2 Feb 2026 15:01:11 -0800 Subject: [PATCH 02/37] fixed some scripts --- .../000-prerequisites/nebius-env-init.sh | 31 ++++++++++++------- .../deploy/000-prerequisites/secrets-init.sh | 6 ++-- .../002-setup/01-deploy-gpu-infrastructure.sh | 9 +++++- .../002-setup/02-deploy-observability.sh | 9 +++++- .../002-setup/03-deploy-osmo-control-plane.sh | 5 ++- .../002-setup/04-deploy-osmo-backend.sh | 3 +- .../deploy/002-setup/05-configure-storage.sh | 9 +++++- .../002-setup/06-configure-service-url.sh | 9 +++++- .../002-setup/07-configure-gpu-platform.sh | 9 +++++- .../cleanup/uninstall-gpu-infrastructure.sh | 12 +++++-- .../cleanup/uninstall-observability.sh | 12 +++++-- .../cleanup/uninstall-osmo-backend.sh | 12 +++++-- .../cleanup/uninstall-osmo-control-plane.sh | 12 +++++-- .../osmo/deploy/002-setup/defaults.sh | 11 +++++-- 14 files changed, 117 insertions(+), 32 deletions(-) diff --git a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh index c95a32e37..deeacc469 100755 --- a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh @@ -82,14 +82,16 @@ prompt_with_default() { local default=$2 local var_name=$3 local value - + if [[ -n "$default" ]]; then - read -p "$prompt [$default]: " value + printf "%s [%s]: " "$prompt" "$default" + read value value=${value:-$default} else - read -p "$prompt: " value + printf "%s: " "$prompt" + read value fi - + eval "$var_name='$value'" } @@ -162,15 +164,18 @@ select_or_create_project() { 
echo "" local choice - read -p "Choose option [1/2/3]: " choice + printf "Choose option [1/2/3]: " + read choice case $choice in 1) - read -p "Enter Project ID: " NEBIUS_PROJECT_ID + printf "Enter Project ID: " + read NEBIUS_PROJECT_ID ;; 2) local project_name - read -p "Enter new project name: " project_name + printf "Enter new project name: " + read project_name if [[ -z "$project_name" ]]; then echo -e "${RED}[ERROR]${NC} Project name cannot be empty" @@ -193,11 +198,13 @@ select_or_create_project() { 3) list_projects "$tenant_id" echo "" - read -p "Enter Project ID from the list above (or 'new' to create): " input + printf "Enter Project ID from the list above (or 'new' to create): " + read input if [[ "$input" == "new" ]]; then local project_name - read -p "Enter new project name: " project_name + printf "Enter new project name: " + read project_name if [[ -z "$project_name" ]]; then echo -e "${RED}[ERROR]${NC} Project name cannot be empty" @@ -310,7 +317,8 @@ main() { else echo "" echo "Current project: $current_project" - read -p "Use this project? (Y/n/new): " use_current + printf "Use this project? (Y/n/new): " + read use_current case $use_current in n|N) @@ -320,7 +328,8 @@ main() { ;; new) local project_name - read -p "Enter new project name: " project_name + printf "Enter new project name: " + read project_name NEBIUS_PROJECT_ID=$(create_project "$NEBIUS_TENANT_ID" "$project_name") if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then return 1 diff --git a/applications/osmo/deploy/000-prerequisites/secrets-init.sh b/applications/osmo/deploy/000-prerequisites/secrets-init.sh index bbdec3e19..534c2c389 100755 --- a/applications/osmo/deploy/000-prerequisites/secrets-init.sh +++ b/applications/osmo/deploy/000-prerequisites/secrets-init.sh @@ -233,7 +233,8 @@ create_postgresql_secret() { if [[ -n "$existing_id" ]]; then echo -e "${YELLOW}[!]${NC} Secret '$POSTGRESQL_SECRET_NAME' already exists (ID: $existing_id)" - read -p " Replace existing secret? 
(y/N): " replace + printf " Replace existing secret? (y/N): " + read replace if [[ "$replace" =~ ^[Yy]$ ]]; then echo " Deleting existing secret..." delete_secret "$existing_id" @@ -278,7 +279,8 @@ create_mek_secret() { if [[ -n "$existing_id" ]]; then echo -e "${YELLOW}[!]${NC} Secret '$MEK_SECRET_NAME' already exists (ID: $existing_id)" - read -p " Replace existing secret? (y/N): " replace + printf " Replace existing secret? (y/N): " + read replace if [[ "$replace" =~ ^[Yy]$ ]]; then echo " Deleting existing secret..." delete_secret "$existing_id" diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh index 58e6f1241..d4b9b04ad 100755 --- a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/002-setup/02-deploy-observability.sh b/applications/osmo/deploy/002-setup/02-deploy-observability.sh index ef3c22f13..38c4692bc 100755 --- a/applications/osmo/deploy/002-setup/02-deploy-observability.sh +++ b/applications/osmo/deploy/002-setup/02-deploy-observability.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + 
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh index 4b53b718d..c4efd0f50 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh @@ -83,11 +83,10 @@ log_info "Using Nebius Managed PostgreSQL..." POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-${OSMO_POSTGRES_PASSWORD:-""}} if [[ -z "$POSTGRES_HOST" ]]; then - read -p "PostgreSQL Host: " POSTGRES_HOST + read_prompt_var "PostgreSQL Host" POSTGRES_HOST "" fi if [[ -z "$POSTGRES_PASSWORD" ]]; then - read -s -p "PostgreSQL Password: " POSTGRES_PASSWORD - echo "" + read_secret_var "PostgreSQL Password" POSTGRES_PASSWORD fi fi diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh index 8a57177bf..5e50c55df 100755 --- a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh @@ -147,7 +147,8 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then --service --roles osmo-backend 2>&1) # Extract token from output (format: "Access token: ") - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | grep -oP 'Access token: \K.*' || echo "") + # Note: Using sed instead of grep -P for macOS compatibility + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: \(.*\)/\1/p' || echo "") if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then log_error "Failed to create service token" diff --git a/applications/osmo/deploy/002-setup/05-configure-storage.sh b/applications/osmo/deploy/002-setup/05-configure-storage.sh index 47e7a9d53..843897699 100755 --- a/applications/osmo/deploy/002-setup/05-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/05-configure-storage.sh @@ -6,7 +6,14 @@ set 
-e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/lib/common.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/06-configure-service-url.sh b/applications/osmo/deploy/002-setup/06-configure-service-url.sh index 76c4ee481..de73f55c8 100755 --- a/applications/osmo/deploy/002-setup/06-configure-service-url.sh +++ b/applications/osmo/deploy/002-setup/06-configure-service-url.sh @@ -6,7 +6,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/lib/common.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh index aa371b74f..6c8be3a71 100755 --- a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh +++ b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh @@ -4,7 +4,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/lib/common.sh" OSMO_URL="${OSMO_URL:-http://localhost:8080}" diff --git 
a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh index 446217ada..0a13bb5ae 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -16,7 +23,8 @@ echo "========================================" echo "" log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" -read -p "Continue? (y/N): " confirm +printf "Continue? 
(y/N): " +read confirm if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh index 4ba162619..8326e2683 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -16,7 +23,8 @@ echo "========================================" echo "" log_warning "This will remove Prometheus, Grafana, and Loki" -read -p "Continue? (y/N): " confirm +printf "Continue? 
(y/N): " +read confirm if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh index 9967324f8..90a66b9e4 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -16,7 +23,8 @@ echo "========================================" echo "" log_warning "This will remove OSMO Backend services" -read -p "Continue? (y/N): " confirm +printf "Continue? 
(y/N): " +read confirm if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh index 63e44420b..fed544c6f 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh @@ -5,7 +5,14 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Determine script directory (works in bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -16,7 +23,8 @@ echo "========================================" echo "" log_warning "This will remove OSMO Control Plane and all OSMO resources" -read -p "Continue? (y/N): " confirm +printf "Continue? 
(y/N): " +read confirm if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 14aae0071..8cce2538d 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -31,7 +31,14 @@ export PROMETHEUS_RETENTION_DAYS="15" export LOKI_RETENTION_DAYS="7" export GRAFANA_ADMIN_PASSWORD="" # Auto-generated if empty -# Paths -export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Paths (compatible with bash and zsh) +if [[ -n "${BASH_SOURCE[0]:-}" ]]; then + export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +elif [[ -n "${ZSH_VERSION:-}" ]]; then + # zsh - use %x prompt expansion for script path + export SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" +else + export SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +fi export VALUES_DIR="${SCRIPT_DIR}/values" export LIB_DIR="${SCRIPT_DIR}/lib" From 65072622145e4777af08de918c1da9c7e6be81a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Mon, 2 Feb 2026 15:29:23 -0800 Subject: [PATCH 03/37] - add endpoint - endale Auth --- .../osmo/deploy/001-iac/osmo-proxy.tf | 216 ++++++++++++++ applications/osmo/deploy/001-iac/outputs.tf | 33 ++- applications/osmo/deploy/001-iac/versions.tf | 14 + .../002-setup/03-deploy-osmo-control-plane.sh | 121 +++++++- .../osmo/deploy/002-setup/SSO-SETUP.md | 275 ++++++++++++++++++ .../osmo/deploy/002-setup/nginx-proxy.yaml | 62 +++- 6 files changed, 693 insertions(+), 28 deletions(-) create mode 100644 applications/osmo/deploy/001-iac/osmo-proxy.tf create mode 100644 applications/osmo/deploy/002-setup/SSO-SETUP.md diff --git a/applications/osmo/deploy/001-iac/osmo-proxy.tf b/applications/osmo/deploy/001-iac/osmo-proxy.tf new file mode 100644 index 000000000..e50af6e03 --- /dev/null +++ b/applications/osmo/deploy/001-iac/osmo-proxy.tf @@ -0,0 +1,216 @@ +# 
============================================================================= +# OSMO Proxy LoadBalancer Service +# ============================================================================= +# Creates the OSMO namespace and LoadBalancer service in Terraform so that +# the external IP can be output. The nginx deployment is created by the +# shell scripts in 002-setup. + +# ----------------------------------------------------------------------------- +# OSMO Namespace +# ----------------------------------------------------------------------------- +resource "kubernetes_namespace_v1" "osmo" { + metadata { + name = "osmo" + } + + depends_on = [module.k8s] +} + +# ----------------------------------------------------------------------------- +# OSMO Proxy ConfigMap (nginx configuration) +# ----------------------------------------------------------------------------- +resource "kubernetes_config_map_v1" "osmo_proxy_nginx" { + metadata { + name = "osmo-proxy-nginx-config" + namespace = kubernetes_namespace_v1.osmo.metadata[0].name + } + + data = { + "nginx.conf" = <<-EOF + events { + worker_connections 1024; + } + + http { + # Logging + access_log /dev/stdout; + error_log /dev/stderr; + + # Conditional WebSocket support + map $http_upgrade $connection_upgrade { + default upgrade; + '' close; + } + + # Upstream servers + upstream osmo-service { + server osmo-service.osmo.svc.cluster.local:80; + } + + upstream osmo-logger { + server osmo-logger.osmo.svc.cluster.local:80; + } + + upstream osmo-agent { + server osmo-agent.osmo.svc.cluster.local:80; + } + + upstream osmo-ui { + server osmo-ui.osmo.svc.cluster.local:80; + } + + upstream keycloak { + server keycloak.osmo.svc.cluster.local:80; + } + + server { + listen 80; + + # Common proxy headers + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # WebSocket 
support + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + + # Timeouts for long-running WebSocket connections + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + + # Route /api/logger/* to osmo-logger (WebSocket for log streaming) + location /api/logger/ { + proxy_pass http://osmo-logger; + } + + # Route /api/agent/* to osmo-agent (WebSocket for backend communication) + location /api/agent/ { + proxy_pass http://osmo-agent; + } + + # Route /api/* to osmo-service (REST API) + location /api/ { + proxy_pass http://osmo-service; + } + + # Route /auth/* and /realms/* to Keycloak (SSO with Google/Azure AD) + location /auth/ { + proxy_pass http://keycloak; + } + + location /realms/ { + proxy_pass http://keycloak; + } + + location /admin/ { + proxy_pass http://keycloak; + } + + location /js/ { + proxy_pass http://keycloak; + } + + location /resources/ { + proxy_pass http://keycloak; + } + + # Route everything else to osmo-ui (Web UI) + location / { + proxy_pass http://osmo-ui; + } + } + } + EOF + } +} + +# ----------------------------------------------------------------------------- +# OSMO Proxy Deployment +# ----------------------------------------------------------------------------- +resource "kubernetes_deployment_v1" "osmo_proxy" { + metadata { + name = "osmo-proxy" + namespace = kubernetes_namespace_v1.osmo.metadata[0].name + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "osmo-proxy" + } + } + + template { + metadata { + labels = { + app = "osmo-proxy" + } + } + + spec { + container { + name = "nginx" + image = "nginx:alpine" + + port { + container_port = 80 + } + + volume_mount { + name = "nginx-config" + mount_path = "/etc/nginx/nginx.conf" + sub_path = "nginx.conf" + } + + resources { + requests = { + cpu = "50m" + memory = "64Mi" + } + limits = { + cpu = "200m" + memory = "128Mi" + } + } + } + + volume { + name = "nginx-config" + config_map { + name = 
kubernetes_config_map_v1.osmo_proxy_nginx.metadata[0].name + } + } + } + } + } +} + +# ----------------------------------------------------------------------------- +# OSMO Proxy LoadBalancer Service +# ----------------------------------------------------------------------------- +resource "kubernetes_service_v1" "osmo_proxy" { + metadata { + name = "osmo-proxy" + namespace = kubernetes_namespace_v1.osmo.metadata[0].name + } + + spec { + selector = { + app = "osmo-proxy" + } + + port { + port = 80 + target_port = 80 + } + + type = "LoadBalancer" + } + + depends_on = [kubernetes_deployment_v1.osmo_proxy] +} diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/001-iac/outputs.tf index 101b52b25..4808125a4 100755 --- a/applications/osmo/deploy/001-iac/outputs.tf +++ b/applications/osmo/deploy/001-iac/outputs.tf @@ -132,37 +132,54 @@ output "wireguard" { } : null } +# ----------------------------------------------------------------------------- +# OSMO Proxy Outputs +# ----------------------------------------------------------------------------- +output "osmo_proxy" { + description = "OSMO Proxy LoadBalancer details" + value = { + external_ip = try(kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip, null) + url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}", null) + ui_url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/", null) + api_url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/api/", null) + } +} + # ----------------------------------------------------------------------------- # Connection Instructions # ----------------------------------------------------------------------------- output "next_steps" { description = "Next steps after deployment" value = <<-EOT - + ======================================== OSMO on Nebius - Deployment Complete 
======================================== - + 1. Get Kubernetes credentials: nebius mk8s cluster get-credentials --id ${module.k8s.cluster_id} --external - + 2. Verify cluster access: kubectl get nodes - + ${var.enable_wireguard ? "3. Set up WireGuard VPN:\n cd ../000-prerequisites && ./wireguard-client-setup.sh\n WireGuard UI: ${module.wireguard[0].ui_url}\n \n 4. " : "3. "}Deploy OSMO components: cd ../002-setup ./01-deploy-gpu-infrastructure.sh ./02-deploy-observability.sh ./03-deploy-osmo-control-plane.sh ./04-deploy-osmo-backend.sh - + + OSMO Access: + UI: ${try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/", "(LoadBalancer IP pending)")} + API: ${try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/api/", "(LoadBalancer IP pending)")} + ${var.enable_managed_postgresql ? "PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 03-deploy-osmo-control-plane.sh)"} - + Object Storage: Bucket: ${module.platform.storage_bucket_name} Endpoint: ${module.platform.storage_endpoint} - + ${var.enable_container_registry ? 
"Container Registry:\n Name: ${module.platform.container_registry_name}\n Endpoint: ${module.platform.container_registry_endpoint}\n Docker login: docker login ${module.platform.container_registry_endpoint}" : "Container Registry: Disabled (set enable_container_registry = true to enable)"} - + EOT } diff --git a/applications/osmo/deploy/001-iac/versions.tf b/applications/osmo/deploy/001-iac/versions.tf index 6042f66dd..a50b60cef 100755 --- a/applications/osmo/deploy/001-iac/versions.tf +++ b/applications/osmo/deploy/001-iac/versions.tf @@ -15,6 +15,10 @@ terraform { source = "dstaroff/units" version = ">= 1.1.1" } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.20" + } } } @@ -23,3 +27,13 @@ provider "nebius" { } provider "random" {} + +provider "kubernetes" { + host = module.k8s.cluster_endpoint + cluster_ca_certificate = base64decode(module.k8s.cluster_ca_certificate) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "nebius" + args = ["mk8s", "cluster", "get-credentials", "--id", module.k8s.cluster_id, "--external", "--token-only"] + } +} diff --git a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh index c4efd0f50..eec524cff 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh @@ -464,6 +464,16 @@ KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" AUTH_DOMAIN="auth-${OSMO_DOMAIN}" +# SSO Identity Provider Configuration (Google and Azure AD) +# Set these environment variables to enable SSO: +# GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET - from Google Cloud Console +# AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID - from Azure Portal +GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID:-}" +GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET:-}" +AZURE_CLIENT_ID="${AZURE_CLIENT_ID:-}" 
+AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET:-}" +AZURE_TENANT_ID="${AZURE_TENANT_ID:-common}" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then log_info "Deploying Keycloak for OSMO authentication..." log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" @@ -787,7 +797,73 @@ spec: "email": "osmo-admin@example.com", "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] }' || echo "User may already exist" - + + # ========================================= + # SSO Identity Providers (Google & Azure AD) + # ========================================= + + # Configure Google Identity Provider (if credentials provided) + GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID}" + GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET}" + if [ -n "\$GOOGLE_CLIENT_ID" ] && [ -n "\$GOOGLE_CLIENT_SECRET" ]; then + echo "Configuring Google Identity Provider..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "alias": "google", + "displayName": "Google", + "providerId": "google", + "enabled": true, + "trustEmail": true, + "storeToken": false, + "addReadTokenRoleOnCreate": false, + "firstBrokerLoginFlowAlias": "first broker login", + "config": { + "clientId": "'""\$GOOGLE_CLIENT_ID""'", + "clientSecret": "'""\$GOOGLE_CLIENT_SECRET""'", + "defaultScope": "openid email profile", + "syncMode": "IMPORT" + } + }' || echo "Google IdP may already exist" + echo "Google Identity Provider configured" + echo " Redirect URI: \${KEYCLOAK_URL}/realms/osmo/broker/google/endpoint" + else + echo "Skipping Google IdP (GOOGLE_CLIENT_ID/GOOGLE_CLIENT_SECRET not set)" + fi + + # Configure Azure AD (Microsoft) Identity Provider (if credentials provided) + AZURE_CLIENT_ID="${AZURE_CLIENT_ID}" + AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET}" + AZURE_TENANT_ID="${AZURE_TENANT_ID}" + if [ -n 
"\$AZURE_CLIENT_ID" ] && [ -n "\$AZURE_CLIENT_SECRET" ]; then + echo "Configuring Azure AD (Microsoft) Identity Provider..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "alias": "microsoft", + "displayName": "Microsoft", + "providerId": "microsoft", + "enabled": true, + "trustEmail": true, + "storeToken": false, + "addReadTokenRoleOnCreate": false, + "firstBrokerLoginFlowAlias": "first broker login", + "config": { + "clientId": "'""\$AZURE_CLIENT_ID""'", + "clientSecret": "'""\$AZURE_CLIENT_SECRET""'", + "tenant": "'""\$AZURE_TENANT_ID""'", + "defaultScope": "openid email profile", + "syncMode": "IMPORT" + } + }' || echo "Microsoft IdP may already exist" + echo "Azure AD Identity Provider configured" + echo " Redirect URI: \${KEYCLOAK_URL}/realms/osmo/broker/microsoft/endpoint" + else + echo "Skipping Azure AD IdP (AZURE_CLIENT_ID/AZURE_CLIENT_SECRET not set)" + fi + echo "" echo "=========================================" echo "Keycloak OSMO configuration complete!" 
@@ -795,6 +871,12 @@ spec: echo "Realm: osmo" echo "Clients: osmo-device, osmo-browser-flow" echo "Test user: osmo-admin / osmo-admin" + if [ -n "\$GOOGLE_CLIENT_ID" ]; then + echo "Google SSO: Enabled" + fi + if [ -n "\$AZURE_CLIENT_ID" ]; then + echo "Azure AD SSO: Enabled" + fi echo "" EOF @@ -831,6 +913,27 @@ EOF echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" echo "" + + # Show SSO configuration status + if [[ -n "${GOOGLE_CLIENT_ID}" ]]; then + echo "Google SSO: Enabled" + echo " Redirect URI (for Google Console): http:///realms/osmo/broker/google/endpoint" + fi + if [[ -n "${AZURE_CLIENT_ID}" ]]; then + echo "Azure AD SSO: Enabled (Tenant: ${AZURE_TENANT_ID})" + echo " Redirect URI (for Azure Portal): http:///realms/osmo/broker/microsoft/endpoint" + fi + if [[ -z "${GOOGLE_CLIENT_ID}" && -z "${AZURE_CLIENT_ID}" ]]; then + echo "SSO Identity Providers: Not configured" + echo " To enable SSO, set environment variables before running:" + echo " export GOOGLE_CLIENT_ID=" + echo " export GOOGLE_CLIENT_SECRET=" + echo " export AZURE_CLIENT_ID=" + echo " export AZURE_CLIENT_SECRET=" + echo " export AZURE_TENANT_ID= # or 'common' for multi-tenant" + echo " Then re-run: DEPLOY_KEYCLOAK=true ./03-deploy-osmo-control-plane.sh" + fi + echo "" # Keycloak is deployed but we disable OSMO's internal auth # because OSMO's JWT validation expects its own keys, not Keycloak's @@ -1235,18 +1338,24 @@ log_success "Service ports verified" # ----------------------------------------------------------------------------- # Step 11: Deploy NGINX Proxy # ----------------------------------------------------------------------------- -# The nginx proxy routes traffic to osmo-service, osmo-logger, and osmo-agent +# The nginx proxy routes traffic to osmo-service, osmo-logger, osmo-agent, and osmo-ui # Required for osmo-ctrl sidecar to communicate with the OSMO service -log_info "Deploying OSMO 
proxy (nginx)..." +# NOTE: If Terraform created the osmo-proxy resources, this step will just verify them -if [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then +if kubectl get deployment osmo-proxy -n "${OSMO_NAMESPACE}" &>/dev/null; then + log_info "OSMO proxy already exists (created by Terraform)" + kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ + log_warning "Timeout waiting for osmo-proxy rollout" + log_success "OSMO proxy verified" +elif [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then + log_info "Deploying OSMO proxy (nginx)..." kubectl apply -f "${SCRIPT_DIR}/nginx-proxy.yaml" kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ log_warning "Timeout waiting for osmo-proxy rollout" log_success "OSMO proxy deployed" else - log_warning "nginx-proxy.yaml not found - skipping proxy deployment" - log_warning "Workflows may fail without the proxy. Create nginx-proxy.yaml and apply manually." + log_warning "nginx-proxy.yaml not found and osmo-proxy not deployed by Terraform" + log_warning "Workflows may fail without the proxy. Run 'terraform apply' or create nginx-proxy.yaml manually." fi # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/002-setup/SSO-SETUP.md b/applications/osmo/deploy/002-setup/SSO-SETUP.md new file mode 100644 index 000000000..717874f68 --- /dev/null +++ b/applications/osmo/deploy/002-setup/SSO-SETUP.md @@ -0,0 +1,275 @@ +# OSMO SSO Setup with Google and Azure AD via Keycloak + +This document describes the SSO (Single Sign-On) implementation for OSMO using Keycloak as an identity broker with Google and Azure AD as identity providers. + +## Overview + +The implementation enables users to authenticate to OSMO using their Google or Microsoft (Azure AD) accounts instead of local credentials. Keycloak acts as an identity broker that federates authentication to these external identity providers. 
+ +``` +User → OSMO UI → Keycloak → Google OAuth2 + → Azure AD (Microsoft) +``` + +## Prerequisites + +### Google Cloud Console Credentials + +1. Go to [Google Cloud Console](https://console.cloud.google.com/) → APIs & Services → Credentials +2. Create OAuth 2.0 Client ID (Web application) +3. Note down: + - **Client ID**: `xxxxxxxxx.apps.googleusercontent.com` + - **Client Secret**: `GOCSPX-xxxxxxxxx` +4. Add authorized redirect URI (after Keycloak is deployed): + - `http:///realms/osmo/broker/google/endpoint` + +### Azure Portal Credentials + +1. Go to [Azure Portal](https://portal.azure.com/) → Azure Active Directory → App registrations +2. New registration → Name: "OSMO SSO" +3. Note down: + - **Application (client) ID**: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` + - **Directory (tenant) ID**: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +4. Create client secret: Certificates & secrets → New client secret + - **Client Secret**: `xxxxxxxxxxxxxxxxxxxxxxxxx` +5. Add redirect URI: Authentication → Add platform → Web + - `http:///realms/osmo/broker/microsoft/endpoint` + +## Deployment + +### Step 1: Set Environment Variables + +```bash +# Google OAuth2 credentials +export GOOGLE_CLIENT_ID="your-google-client-id.apps.googleusercontent.com" +export GOOGLE_CLIENT_SECRET="GOCSPX-your-secret" + +# Azure AD credentials +export AZURE_CLIENT_ID="your-azure-application-id" +export AZURE_CLIENT_SECRET="your-azure-client-secret" +export AZURE_TENANT_ID="your-tenant-id" # or 'common' for multi-tenant +``` + +### Step 2: Deploy OSMO with Keycloak + +```bash +cd applications/osmo/deploy/002-setup +DEPLOY_KEYCLOAK=true ./03-deploy-osmo-control-plane.sh +``` + +### Step 3: Update Redirect URIs + +After deployment, get the Keycloak URL and update the redirect URIs in Google Cloud Console and Azure Portal: + +**For port-forward access (development):** +```bash +kubectl port-forward -n osmo svc/keycloak 8081:80 +# Keycloak URL: http://localhost:8081 +``` + +**For LoadBalancer access 
(production):** +```bash +# Get the external IP +kubectl get svc osmo-proxy -n osmo -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +# Keycloak URL: http:///realms/osmo +``` + +Update redirect URIs: +- **Google Console**: `http:///realms/osmo/broker/google/endpoint` +- **Azure Portal**: `http:///realms/osmo/broker/microsoft/endpoint` + +## Files Changed + +### 1. `applications/osmo/deploy/002-setup/nginx-proxy.yaml` + +Added Keycloak upstream and routing for SSO redirect flows. + +**Changes:** +```yaml +# Added upstream for Keycloak +upstream keycloak { + server keycloak.osmo.svc.cluster.local:80; +} + +# Added location blocks for Keycloak paths +location /auth/ { + proxy_pass http://keycloak; +} + +location /realms/ { + proxy_pass http://keycloak; +} + +location /admin/ { + proxy_pass http://keycloak; +} + +location /js/ { + proxy_pass http://keycloak; +} + +location /resources/ { + proxy_pass http://keycloak; +} +``` + +### 2. `applications/osmo/deploy/001-iac/osmo-proxy.tf` + +Same nginx configuration changes as above, keeping Terraform and standalone YAML in sync. + +**Changes:** +- Added `keycloak` upstream server block +- Added location blocks for `/auth/`, `/realms/`, `/admin/`, `/js/`, `/resources/` + +### 3. `applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh` + +Added SSO configuration support. + +**Changes:** + +1. **Environment variable declarations** (lines 467-475): +```bash +# SSO Identity Provider Configuration (Google and Azure AD) +GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID:-}" +GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET:-}" +AZURE_CLIENT_ID="${AZURE_CLIENT_ID:-}" +AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET:-}" +AZURE_TENANT_ID="${AZURE_TENANT_ID:-common}" +``` + +2. 
**Google Identity Provider configuration** in Keycloak setup job: +```bash +# Configure Google Identity Provider (if credentials provided) +if [ -n "$GOOGLE_CLIENT_ID" ] && [ -n "$GOOGLE_CLIENT_SECRET" ]; then + curl -s -X POST "${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "alias": "google", + "displayName": "Google", + "providerId": "google", + "enabled": true, + "trustEmail": true, + "config": { + "clientId": "...", + "clientSecret": "...", + "defaultScope": "openid email profile", + "syncMode": "IMPORT" + } + }' +fi +``` + +3. **Azure AD Identity Provider configuration** in Keycloak setup job: +```bash +# Configure Azure AD (Microsoft) Identity Provider (if credentials provided) +if [ -n "$AZURE_CLIENT_ID" ] && [ -n "$AZURE_CLIENT_SECRET" ]; then + curl -s -X POST "${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "alias": "microsoft", + "displayName": "Microsoft", + "providerId": "microsoft", + "enabled": true, + "trustEmail": true, + "config": { + "clientId": "...", + "clientSecret": "...", + "tenant": "...", + "defaultScope": "openid email profile", + "syncMode": "IMPORT" + } + }' +fi +``` + +4. **Output messages** showing SSO status and configuration instructions. + +## Architecture + +### Authentication Flow + +1. User accesses OSMO UI at `http:///` +2. User clicks "Login" and is redirected to Keycloak +3. Keycloak login page shows options: "Login with Google" or "Login with Microsoft" +4. User selects identity provider and authenticates +5. Identity provider redirects back to Keycloak with authorization code +6. Keycloak exchanges code for tokens and creates/links user account +7. 
User is redirected back to OSMO UI with session established + +### Network Flow + +``` +External Traffic + │ + ▼ +┌──────────────────┐ +│ LoadBalancer │ +│ (osmo-proxy) │ +└────────┬─────────┘ + │ + ▼ +┌──────────────────────────────────────────────────┐ +│ NGINX │ +│ ┌─────────────┐ ┌─────────────┐ ┌───────────┐ │ +│ │ /api/* │ │ /realms/* │ │ /* │ │ +│ │ osmo-service│ │ keycloak │ │ osmo-ui │ │ +│ └─────────────┘ └─────────────┘ └───────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## Verification + +1. Access OSMO UI: `http:///` +2. Click "Login" +3. Verify Google and Microsoft login options appear +4. Test authentication with each provider +5. Verify user is logged in (not as "guest") + +## Troubleshooting + +### Check Keycloak Logs +```bash +kubectl logs -n osmo -l app.kubernetes.io/name=keycloak --tail=100 +``` + +### Check Identity Provider Configuration +```bash +# Port-forward to Keycloak +kubectl port-forward -n osmo svc/keycloak 8081:80 + +# Access admin console +open http://localhost:8081/admin + +# Login: admin / +kubectl get secret keycloak-admin-secret -n osmo -o jsonpath='{.data.password}' | base64 -d +``` + +### Verify Redirect URIs +In Keycloak Admin Console: +1. Go to Identity Providers → Google/Microsoft +2. Copy the "Redirect URI" shown +3. Ensure this exact URI is configured in Google Cloud Console / Azure Portal + +### Common Issues + +1. **"Invalid redirect_uri" error** + - The redirect URI in Google/Azure doesn't match Keycloak's expected URI + - Copy exact URI from Keycloak Identity Provider settings + +2. **"Login with Google/Microsoft" not showing** + - Identity provider not configured (credentials not set during deployment) + - Re-run deployment with environment variables set + +3. 
**User created but can't access OSMO** + - Check if user is created in Keycloak (Users section) + - Verify user has appropriate roles assigned + +## Security Considerations + +- Use HTTPS in production (configure TLS on LoadBalancer or Ingress) +- Restrict `trustEmail` if email verification is required +- Consider configuring allowed domains for Google/Azure authentication +- Rotate client secrets periodically +- Use Kubernetes secrets management (e.g., External Secrets Operator) for credentials in production diff --git a/applications/osmo/deploy/002-setup/nginx-proxy.yaml b/applications/osmo/deploy/002-setup/nginx-proxy.yaml index b8eab7837..3bf0ee743 100755 --- a/applications/osmo/deploy/002-setup/nginx-proxy.yaml +++ b/applications/osmo/deploy/002-setup/nginx-proxy.yaml @@ -13,7 +13,7 @@ data: # Logging access_log /dev/stdout; error_log /dev/stderr; - + # Conditional WebSocket support # Sets Connection header to "upgrade" for WebSocket requests, "close" otherwise # This is important for proper handling of both WebSocket and regular HTTP requests @@ -21,52 +21,86 @@ data: default upgrade; '' close; } - + # Upstream servers upstream osmo-service { server osmo-service.osmo.svc.cluster.local:80; } - + upstream osmo-logger { server osmo-logger.osmo.svc.cluster.local:80; } - + upstream osmo-agent { server osmo-agent.osmo.svc.cluster.local:80; } - + + upstream osmo-ui { + server osmo-ui.osmo.svc.cluster.local:80; + } + + upstream keycloak { + server keycloak.osmo.svc.cluster.local:80; + } + server { listen 80; - + # Common proxy headers proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; - + # WebSocket support (conditional based on Upgrade header) proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection $connection_upgrade; - + # Timeouts for long-running WebSocket connections (osmo-ctrl logging) 
proxy_read_timeout 3600s; proxy_send_timeout 3600s; - + # Route /api/logger/* to osmo-logger (WebSocket for log streaming) location /api/logger/ { proxy_pass http://osmo-logger; } - + # Route /api/agent/* to osmo-agent (WebSocket for backend communication) location /api/agent/ { proxy_pass http://osmo-agent; } - - # Everything else to osmo-service (REST API) - location / { + + # Route /api/* to osmo-service (REST API) + location /api/ { proxy_pass http://osmo-service; } + + # Route /auth/* and /realms/* to Keycloak (SSO with Google/Azure AD) + location /auth/ { + proxy_pass http://keycloak; + } + + location /realms/ { + proxy_pass http://keycloak; + } + + location /admin/ { + proxy_pass http://keycloak; + } + + location /js/ { + proxy_pass http://keycloak; + } + + location /resources/ { + proxy_pass http://keycloak; + } + + # Route everything else to osmo-ui (Web UI) + location / { + proxy_pass http://osmo-ui; + } } } --- @@ -117,4 +151,4 @@ spec: ports: - port: 80 targetPort: 80 - type: ClusterIP + type: LoadBalancer From 91f61e617a22360fcda0f58c36aa5cd0ab0abc82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 3 Feb 2026 10:52:49 -0800 Subject: [PATCH 04/37] - add endpoint - endale Auth --- .../000-prerequisites/nebius-env-init.sh | 110 +++++-- .../deploy/000-prerequisites/secrets-init.sh | 109 ++++++- .../osmo/deploy/001-iac/osmo-proxy.tf | 216 -------------- applications/osmo/deploy/001-iac/outputs.tf | 33 +-- applications/osmo/deploy/001-iac/versions.tf | 14 - .../002-setup/01-deploy-gpu-infrastructure.sh | 9 +- .../002-setup/02-deploy-observability.sh | 9 +- .../002-setup/03-deploy-osmo-control-plane.sh | 133 +-------- .../002-setup/04-deploy-osmo-backend.sh | 3 +- .../deploy/002-setup/05-configure-storage.sh | 9 +- .../002-setup/06-configure-service-url.sh | 9 +- .../002-setup/07-configure-gpu-platform.sh | 9 +- .../osmo/deploy/002-setup/SSO-SETUP.md | 275 ------------------ .../cleanup/uninstall-gpu-infrastructure.sh | 12 +- 
.../cleanup/uninstall-observability.sh | 40 ++- .../cleanup/uninstall-osmo-backend.sh | 12 +- .../cleanup/uninstall-osmo-control-plane.sh | 12 +- .../osmo/deploy/002-setup/defaults.sh | 11 +- .../osmo/deploy/002-setup/lib/common.sh | 50 ++++ .../osmo/deploy/002-setup/nginx-proxy.yaml | 62 +--- .../deploy/002-setup/values/prometheus.yaml | 2 + modules/gpu-operator-custom/helm.tf | 0 modules/gpu-operator-custom/variables.tf | 0 modules/nims/Readme.md | 1 + modules/nims/bionemo.tf | 0 modules/nims/boltz2.tf | 1 + modules/nims/evo2_40.tf | 117 -------- modules/nims/genmol.tf | 1 + modules/nims/main.tf | 4 +- modules/nims/msa-search.tf | 1 + modules/nims/openfold2.tf | 1 + modules/nims/openfold3.tf | 1 + modules/nims/output.tf | 3 - modules/nims/provider.tf | 0 modules/nims/qwen3-next-80b-a3b-instruct.tf | 0 modules/nims/variables.tf | 1 + 36 files changed, 319 insertions(+), 951 deletions(-) delete mode 100644 applications/osmo/deploy/001-iac/osmo-proxy.tf delete mode 100644 applications/osmo/deploy/002-setup/SSO-SETUP.md mode change 100644 => 100755 modules/gpu-operator-custom/helm.tf mode change 100644 => 100755 modules/gpu-operator-custom/variables.tf mode change 100644 => 100755 modules/nims/Readme.md mode change 100644 => 100755 modules/nims/bionemo.tf mode change 100644 => 100755 modules/nims/boltz2.tf delete mode 100644 modules/nims/evo2_40.tf mode change 100644 => 100755 modules/nims/genmol.tf mode change 100644 => 100755 modules/nims/main.tf mode change 100644 => 100755 modules/nims/msa-search.tf mode change 100644 => 100755 modules/nims/openfold2.tf mode change 100644 => 100755 modules/nims/openfold3.tf delete mode 100644 modules/nims/output.tf mode change 100644 => 100755 modules/nims/provider.tf mode change 100644 => 100755 modules/nims/qwen3-next-80b-a3b-instruct.tf mode change 100644 => 100755 modules/nims/variables.tf diff --git a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh 
index deeacc469..d1d18ed6f 100755 --- a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh @@ -27,6 +27,11 @@ is_wsl() { grep -qi microsoft /proc/version 2>/dev/null } +# Check if jq is installed +has_jq() { + command -v jq &>/dev/null +} + # Get Nebius CLI path get_nebius_path() { if command -v nebius &>/dev/null; then @@ -76,25 +81,43 @@ check_nebius_auth() { return 1 } -# Interactive prompt with default value -prompt_with_default() { +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { local prompt=$1 - local default=$2 - local var_name=$3 - local value + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi if [[ -n "$default" ]]; then - printf "%s [%s]: " "$prompt" "$default" - read value - value=${value:-$default} + printf "%s [%s]: " "$prompt" "$default" >"$write_to" else - printf "%s: " "$prompt" - read value + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" fi eval "$var_name='$value'" } +# Interactive prompt with default value +prompt_with_default() { + local prompt=$1 + local default=$2 + local var_name=$3 + + read_prompt_var "$prompt" "$var_name" "$default" +} + # List existing projects in a tenant list_projects() { local tenant_id=$1 @@ -164,18 +187,15 @@ select_or_create_project() { echo "" local choice - printf "Choose option [1/2/3]: " - read choice + read_prompt_var "Choose option [1/2/3]" choice "" case $choice in 1) - printf "Enter Project ID: " - read NEBIUS_PROJECT_ID + read_prompt_var "Enter Project ID" NEBIUS_PROJECT_ID "" ;; 2) local project_name - printf "Enter new project name: " - read project_name + read_prompt_var "Enter new project name" project_name "" if [[ -z 
"$project_name" ]]; then echo -e "${RED}[ERROR]${NC} Project name cannot be empty" @@ -198,13 +218,11 @@ select_or_create_project() { 3) list_projects "$tenant_id" echo "" - printf "Enter Project ID from the list above (or 'new' to create): " - read input + read_prompt_var "Enter Project ID from the list above (or 'new' to create)" input "" if [[ "$input" == "new" ]]; then local project_name - printf "Enter new project name: " - read project_name + read_prompt_var "Enter new project name" project_name "" if [[ -z "$project_name" ]]; then echo -e "${RED}[ERROR]${NC} Project name cannot be empty" @@ -272,18 +290,52 @@ main() { local current_project="${NEBIUS_PROJECT_ID:-}" local current_region="${NEBIUS_REGION:-eu-north1}" + # Sanitize previously set values in case they were corrupted by a failed prompt + if [[ -n "$current_tenant" && ! "$current_tenant" =~ ^tenant-[a-z0-9]+$ ]]; then + current_tenant="" + fi + if [[ -n "$current_project" && ! "$current_project" =~ ^project-[a-z0-9]+$ ]]; then + current_project="" + fi + # Try to list tenants to help user find their tenant ID echo "Fetching available tenants..." 
local tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) if [[ -n "$tenants" && "$tenants" != "[]" ]]; then echo "" echo "Available tenants:" - echo "$tenants" | jq -r '.[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true - # Auto-detect if only one tenant - local tenant_count=$(echo "$tenants" | jq -r 'length' 2>/dev/null || echo "0") - if [[ "$tenant_count" == "1" && -z "$current_tenant" ]]; then - current_tenant=$(echo "$tenants" | jq -r '.[0].metadata.id' 2>/dev/null) - echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" + if has_jq; then + local page_token="" + local total_count=0 + local last_tenant_id="" + while :; do + if [[ -n "$page_token" ]]; then + tenants=$("$nebius_path" iam tenant list --format json --page-token "$page_token" 2>/dev/null) + else + tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) + fi + + echo "$tenants" | jq -r '.items // . | map(select(.metadata.name | startswith("billing-test") | not)) | .[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true + local page_count + page_count=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | length' 2>/dev/null || echo "0") + total_count=$((total_count + page_count)) + if [[ "$page_count" -gt 0 ]]; then + last_tenant_id=$(echo "$tenants" | jq -r '(.items // .) 
| map(select(.metadata.name | startswith("billing-test") | not)) | .[-1].metadata.id' 2>/dev/null) + fi + + page_token=$(echo "$tenants" | jq -r '.next_page_token // empty' 2>/dev/null) + if [[ -z "$page_token" ]]; then + break + fi + done + + # Auto-detect if only one tenant across all pages + if [[ "$total_count" == "1" && -z "$current_tenant" ]]; then + current_tenant="$last_tenant_id" + echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" + fi + else + echo " (jq not found; run 'brew install jq' to show tenants)" fi fi @@ -317,8 +369,7 @@ main() { else echo "" echo "Current project: $current_project" - printf "Use this project? (Y/n/new): " - read use_current + read_prompt_var "Use this project? (Y/n/new)" use_current "" case $use_current in n|N) @@ -328,8 +379,7 @@ main() { ;; new) local project_name - printf "Enter new project name: " - read project_name + read_prompt_var "Enter new project name" project_name "" NEBIUS_PROJECT_ID=$(create_project "$NEBIUS_TENANT_ID" "$project_name") if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then return 1 diff --git a/applications/osmo/deploy/000-prerequisites/secrets-init.sh b/applications/osmo/deploy/000-prerequisites/secrets-init.sh index 534c2c389..8a37f4203 100755 --- a/applications/osmo/deploy/000-prerequisites/secrets-init.sh +++ b/applications/osmo/deploy/000-prerequisites/secrets-init.sh @@ -36,6 +36,99 @@ echo "" # Helper Functions # ----------------------------------------------------------------------------- +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! 
-w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +# Return a random integer in range [min, max] using /dev/urandom. +rand_int() { + local min=$1 + local max=$2 + local range=$((max - min + 1)) + local num="" + + while :; do + num=$(od -An -N2 -tu2 /dev/urandom | tr -d ' ') + if [[ -n "$num" ]]; then + echo $((min + num % range)) + return 0 + fi + done +} + +# Pick a random character from a set. +rand_char_from_set() { + local set=$1 + local idx + idx=$(rand_int 0 $((${#set} - 1))) + printf "%s" "${set:$idx:1}" +} + +# Shuffle a string using Fisher-Yates. +shuffle_string() { + local input=$1 + local -a chars + local i j tmp + local len=${#input} + + if [[ -n "${BASH_VERSION:-}" ]]; then + for ((i = 0; i < len; i++)); do + chars[i]="${input:i:1}" + done + for ((i = len - 1; i > 0; i--)); do + j=$(rand_int 0 "$i") + tmp="${chars[i]}" + chars[i]="${chars[j]}" + chars[j]="$tmp" + done + local out="" + for ((i = 0; i < len; i++)); do + out+="${chars[i]}" + done + printf "%s" "$out" + else + # zsh uses 1-based indexing for arrays and string subscripts + for ((i = 1; i <= len; i++)); do + chars[i]="${input[$i]}" + done + for ((i = len; i > 1; i--)); do + j=$(rand_int 1 "$i") + tmp="${chars[i]}" + chars[i]="${chars[j]}" + chars[j]="$tmp" + done + local out="" + for ((i = 1; i <= len; i++)); do + out+="${chars[i]}" + done + printf "%s" "$out" + fi +} + get_nebius_path() { if command -v nebius &>/dev/null; then command -v nebius @@ -99,15 +192,15 @@ generate_postgresql_password() { password=$(openssl rand -base64 32 | tr -d '/+=\n' | head -c 28) # Add required character types - local lower=$(echo "abcdefghijklmnopqrstuvwxyz" | fold -w1 | shuf | head -1) - local 
upper=$(echo "ABCDEFGHIJKLMNOPQRSTUVWXYZ" | fold -w1 | shuf | head -1) - local digit=$(echo "0123456789" | fold -w1 | shuf | head -1) - local special=$(echo '!#$^&*()-_=+' | fold -w1 | shuf | head -1) + local lower=$(rand_char_from_set "abcdefghijklmnopqrstuvwxyz") + local upper=$(rand_char_from_set "ABCDEFGHIJKLMNOPQRSTUVWXYZ") + local digit=$(rand_char_from_set "0123456789") + local special=$(rand_char_from_set '!#$^&*()-_=+') password="${password}${lower}${upper}${digit}${special}" # Shuffle the password - password=$(echo "$password" | fold -w1 | shuf | tr -d '\n') + password=$(shuffle_string "$password") # Verify requirements if [[ ${#password} -ge 32 ]] && \ @@ -233,8 +326,7 @@ create_postgresql_secret() { if [[ -n "$existing_id" ]]; then echo -e "${YELLOW}[!]${NC} Secret '$POSTGRESQL_SECRET_NAME' already exists (ID: $existing_id)" - printf " Replace existing secret? (y/N): " - read replace + read_prompt_var " Replace existing secret? (y/N)" replace "" if [[ "$replace" =~ ^[Yy]$ ]]; then echo " Deleting existing secret..." delete_secret "$existing_id" @@ -279,8 +371,7 @@ create_mek_secret() { if [[ -n "$existing_id" ]]; then echo -e "${YELLOW}[!]${NC} Secret '$MEK_SECRET_NAME' already exists (ID: $existing_id)" - printf " Replace existing secret? (y/N): " - read replace + read_prompt_var " Replace existing secret? (y/N)" replace "" if [[ "$replace" =~ ^[Yy]$ ]]; then echo " Deleting existing secret..." 
delete_secret "$existing_id" diff --git a/applications/osmo/deploy/001-iac/osmo-proxy.tf b/applications/osmo/deploy/001-iac/osmo-proxy.tf deleted file mode 100644 index e50af6e03..000000000 --- a/applications/osmo/deploy/001-iac/osmo-proxy.tf +++ /dev/null @@ -1,216 +0,0 @@ -# ============================================================================= -# OSMO Proxy LoadBalancer Service -# ============================================================================= -# Creates the OSMO namespace and LoadBalancer service in Terraform so that -# the external IP can be output. The nginx deployment is created by the -# shell scripts in 002-setup. - -# ----------------------------------------------------------------------------- -# OSMO Namespace -# ----------------------------------------------------------------------------- -resource "kubernetes_namespace_v1" "osmo" { - metadata { - name = "osmo" - } - - depends_on = [module.k8s] -} - -# ----------------------------------------------------------------------------- -# OSMO Proxy ConfigMap (nginx configuration) -# ----------------------------------------------------------------------------- -resource "kubernetes_config_map_v1" "osmo_proxy_nginx" { - metadata { - name = "osmo-proxy-nginx-config" - namespace = kubernetes_namespace_v1.osmo.metadata[0].name - } - - data = { - "nginx.conf" = <<-EOF - events { - worker_connections 1024; - } - - http { - # Logging - access_log /dev/stdout; - error_log /dev/stderr; - - # Conditional WebSocket support - map $http_upgrade $connection_upgrade { - default upgrade; - '' close; - } - - # Upstream servers - upstream osmo-service { - server osmo-service.osmo.svc.cluster.local:80; - } - - upstream osmo-logger { - server osmo-logger.osmo.svc.cluster.local:80; - } - - upstream osmo-agent { - server osmo-agent.osmo.svc.cluster.local:80; - } - - upstream osmo-ui { - server osmo-ui.osmo.svc.cluster.local:80; - } - - upstream keycloak { - server keycloak.osmo.svc.cluster.local:80; - } - - 
server { - listen 80; - - # Common proxy headers - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - # WebSocket support - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - - # Timeouts for long-running WebSocket connections - proxy_read_timeout 3600s; - proxy_send_timeout 3600s; - - # Route /api/logger/* to osmo-logger (WebSocket for log streaming) - location /api/logger/ { - proxy_pass http://osmo-logger; - } - - # Route /api/agent/* to osmo-agent (WebSocket for backend communication) - location /api/agent/ { - proxy_pass http://osmo-agent; - } - - # Route /api/* to osmo-service (REST API) - location /api/ { - proxy_pass http://osmo-service; - } - - # Route /auth/* and /realms/* to Keycloak (SSO with Google/Azure AD) - location /auth/ { - proxy_pass http://keycloak; - } - - location /realms/ { - proxy_pass http://keycloak; - } - - location /admin/ { - proxy_pass http://keycloak; - } - - location /js/ { - proxy_pass http://keycloak; - } - - location /resources/ { - proxy_pass http://keycloak; - } - - # Route everything else to osmo-ui (Web UI) - location / { - proxy_pass http://osmo-ui; - } - } - } - EOF - } -} - -# ----------------------------------------------------------------------------- -# OSMO Proxy Deployment -# ----------------------------------------------------------------------------- -resource "kubernetes_deployment_v1" "osmo_proxy" { - metadata { - name = "osmo-proxy" - namespace = kubernetes_namespace_v1.osmo.metadata[0].name - } - - spec { - replicas = 1 - - selector { - match_labels = { - app = "osmo-proxy" - } - } - - template { - metadata { - labels = { - app = "osmo-proxy" - } - } - - spec { - container { - name = "nginx" - image = "nginx:alpine" - - port { - container_port = 80 - } - - volume_mount { - name = "nginx-config" - mount_path = 
"/etc/nginx/nginx.conf" - sub_path = "nginx.conf" - } - - resources { - requests = { - cpu = "50m" - memory = "64Mi" - } - limits = { - cpu = "200m" - memory = "128Mi" - } - } - } - - volume { - name = "nginx-config" - config_map { - name = kubernetes_config_map_v1.osmo_proxy_nginx.metadata[0].name - } - } - } - } - } -} - -# ----------------------------------------------------------------------------- -# OSMO Proxy LoadBalancer Service -# ----------------------------------------------------------------------------- -resource "kubernetes_service_v1" "osmo_proxy" { - metadata { - name = "osmo-proxy" - namespace = kubernetes_namespace_v1.osmo.metadata[0].name - } - - spec { - selector = { - app = "osmo-proxy" - } - - port { - port = 80 - target_port = 80 - } - - type = "LoadBalancer" - } - - depends_on = [kubernetes_deployment_v1.osmo_proxy] -} diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/001-iac/outputs.tf index 4808125a4..101b52b25 100755 --- a/applications/osmo/deploy/001-iac/outputs.tf +++ b/applications/osmo/deploy/001-iac/outputs.tf @@ -132,54 +132,37 @@ output "wireguard" { } : null } -# ----------------------------------------------------------------------------- -# OSMO Proxy Outputs -# ----------------------------------------------------------------------------- -output "osmo_proxy" { - description = "OSMO Proxy LoadBalancer details" - value = { - external_ip = try(kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip, null) - url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}", null) - ui_url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/", null) - api_url = try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/api/", null) - } -} - # ----------------------------------------------------------------------------- # Connection Instructions # 
----------------------------------------------------------------------------- output "next_steps" { description = "Next steps after deployment" value = <<-EOT - + ======================================== OSMO on Nebius - Deployment Complete ======================================== - + 1. Get Kubernetes credentials: nebius mk8s cluster get-credentials --id ${module.k8s.cluster_id} --external - + 2. Verify cluster access: kubectl get nodes - + ${var.enable_wireguard ? "3. Set up WireGuard VPN:\n cd ../000-prerequisites && ./wireguard-client-setup.sh\n WireGuard UI: ${module.wireguard[0].ui_url}\n \n 4. " : "3. "}Deploy OSMO components: cd ../002-setup ./01-deploy-gpu-infrastructure.sh ./02-deploy-observability.sh ./03-deploy-osmo-control-plane.sh ./04-deploy-osmo-backend.sh - - OSMO Access: - UI: ${try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/", "(LoadBalancer IP pending)")} - API: ${try("http://${kubernetes_service_v1.osmo_proxy.status[0].load_balancer[0].ingress[0].ip}/api/", "(LoadBalancer IP pending)")} - + ${var.enable_managed_postgresql ? "PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 03-deploy-osmo-control-plane.sh)"} - + Object Storage: Bucket: ${module.platform.storage_bucket_name} Endpoint: ${module.platform.storage_endpoint} - + ${var.enable_container_registry ? 
"Container Registry:\n Name: ${module.platform.container_registry_name}\n Endpoint: ${module.platform.container_registry_endpoint}\n Docker login: docker login ${module.platform.container_registry_endpoint}" : "Container Registry: Disabled (set enable_container_registry = true to enable)"} - + EOT } diff --git a/applications/osmo/deploy/001-iac/versions.tf b/applications/osmo/deploy/001-iac/versions.tf index a50b60cef..6042f66dd 100755 --- a/applications/osmo/deploy/001-iac/versions.tf +++ b/applications/osmo/deploy/001-iac/versions.tf @@ -15,10 +15,6 @@ terraform { source = "dstaroff/units" version = ">= 1.1.1" } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.20" - } } } @@ -27,13 +23,3 @@ provider "nebius" { } provider "random" {} - -provider "kubernetes" { - host = module.k8s.cluster_endpoint - cluster_ca_certificate = base64decode(module.k8s.cluster_ca_certificate) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "nebius" - args = ["mk8s", "cluster", "get-credentials", "--id", module.k8s.cluster_id, "--external", "--token-only"] - } -} diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh index d4b9b04ad..58e6f1241 100755 --- a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/002-setup/02-deploy-observability.sh 
b/applications/osmo/deploy/002-setup/02-deploy-observability.sh index 38c4692bc..ef3c22f13 100755 --- a/applications/osmo/deploy/002-setup/02-deploy-observability.sh +++ b/applications/osmo/deploy/002-setup/02-deploy-observability.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh index eec524cff..117108cd9 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh @@ -292,14 +292,14 @@ EOF # Poll for completion - init pods go directly to Completed/Succeeded very quickly max_wait=120 elapsed=0 - status="" + pod_status="" while [[ $elapsed -lt $max_wait ]]; do - status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + pod_status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") - if [[ "$status" == "Succeeded" ]]; then + if [[ "$pod_status" == "Succeeded" ]]; then break - elif [[ "$status" == "Failed" ]]; then + elif [[ "$pod_status" == "Failed" ]]; then log_error "Database initialization failed. Checking logs..." kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found @@ -310,8 +310,8 @@ EOF ((elapsed += 2)) done - if [[ "$status" != "Succeeded" ]]; then - log_error "Database initialization timed out (status: $status). 
Checking logs..." + if [[ "$pod_status" != "Succeeded" ]]; then + log_error "Database initialization timed out (status: $pod_status). Checking logs..." kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" 2>/dev/null || true kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found exit 1 @@ -464,16 +464,6 @@ KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" AUTH_DOMAIN="auth-${OSMO_DOMAIN}" -# SSO Identity Provider Configuration (Google and Azure AD) -# Set these environment variables to enable SSO: -# GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET - from Google Cloud Console -# AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID - from Azure Portal -GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID:-}" -GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET:-}" -AZURE_CLIENT_ID="${AZURE_CLIENT_ID:-}" -AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET:-}" -AZURE_TENANT_ID="${AZURE_TENANT_ID:-common}" - if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then log_info "Deploying Keycloak for OSMO authentication..." log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" @@ -797,73 +787,7 @@ spec: "email": "osmo-admin@example.com", "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] }' || echo "User may already exist" - - # ========================================= - # SSO Identity Providers (Google & Azure AD) - # ========================================= - - # Configure Google Identity Provider (if credentials provided) - GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID}" - GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET}" - if [ -n "\$GOOGLE_CLIENT_ID" ] && [ -n "\$GOOGLE_CLIENT_SECRET" ]; then - echo "Configuring Google Identity Provider..." 
- curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "alias": "google", - "displayName": "Google", - "providerId": "google", - "enabled": true, - "trustEmail": true, - "storeToken": false, - "addReadTokenRoleOnCreate": false, - "firstBrokerLoginFlowAlias": "first broker login", - "config": { - "clientId": "'""\$GOOGLE_CLIENT_ID""'", - "clientSecret": "'""\$GOOGLE_CLIENT_SECRET""'", - "defaultScope": "openid email profile", - "syncMode": "IMPORT" - } - }' || echo "Google IdP may already exist" - echo "Google Identity Provider configured" - echo " Redirect URI: \${KEYCLOAK_URL}/realms/osmo/broker/google/endpoint" - else - echo "Skipping Google IdP (GOOGLE_CLIENT_ID/GOOGLE_CLIENT_SECRET not set)" - fi - - # Configure Azure AD (Microsoft) Identity Provider (if credentials provided) - AZURE_CLIENT_ID="${AZURE_CLIENT_ID}" - AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET}" - AZURE_TENANT_ID="${AZURE_TENANT_ID}" - if [ -n "\$AZURE_CLIENT_ID" ] && [ -n "\$AZURE_CLIENT_SECRET" ]; then - echo "Configuring Azure AD (Microsoft) Identity Provider..." 
- curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "alias": "microsoft", - "displayName": "Microsoft", - "providerId": "microsoft", - "enabled": true, - "trustEmail": true, - "storeToken": false, - "addReadTokenRoleOnCreate": false, - "firstBrokerLoginFlowAlias": "first broker login", - "config": { - "clientId": "'""\$AZURE_CLIENT_ID""'", - "clientSecret": "'""\$AZURE_CLIENT_SECRET""'", - "tenant": "'""\$AZURE_TENANT_ID""'", - "defaultScope": "openid email profile", - "syncMode": "IMPORT" - } - }' || echo "Microsoft IdP may already exist" - echo "Azure AD Identity Provider configured" - echo " Redirect URI: \${KEYCLOAK_URL}/realms/osmo/broker/microsoft/endpoint" - else - echo "Skipping Azure AD IdP (AZURE_CLIENT_ID/AZURE_CLIENT_SECRET not set)" - fi - + echo "" echo "=========================================" echo "Keycloak OSMO configuration complete!" @@ -871,12 +795,6 @@ spec: echo "Realm: osmo" echo "Clients: osmo-device, osmo-browser-flow" echo "Test user: osmo-admin / osmo-admin" - if [ -n "\$GOOGLE_CLIENT_ID" ]; then - echo "Google SSO: Enabled" - fi - if [ -n "\$AZURE_CLIENT_ID" ]; then - echo "Azure AD SSO: Enabled" - fi echo "" EOF @@ -913,27 +831,6 @@ EOF echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" echo "" - - # Show SSO configuration status - if [[ -n "${GOOGLE_CLIENT_ID}" ]]; then - echo "Google SSO: Enabled" - echo " Redirect URI (for Google Console): http:///realms/osmo/broker/google/endpoint" - fi - if [[ -n "${AZURE_CLIENT_ID}" ]]; then - echo "Azure AD SSO: Enabled (Tenant: ${AZURE_TENANT_ID})" - echo " Redirect URI (for Azure Portal): http:///realms/osmo/broker/microsoft/endpoint" - fi - if [[ -z "${GOOGLE_CLIENT_ID}" && -z "${AZURE_CLIENT_ID}" ]]; then - echo "SSO Identity Providers: Not configured" - echo " To enable SSO, set 
environment variables before running:" - echo " export GOOGLE_CLIENT_ID=" - echo " export GOOGLE_CLIENT_SECRET=" - echo " export AZURE_CLIENT_ID=" - echo " export AZURE_CLIENT_SECRET=" - echo " export AZURE_TENANT_ID= # or 'common' for multi-tenant" - echo " Then re-run: DEPLOY_KEYCLOAK=true ./03-deploy-osmo-control-plane.sh" - fi - echo "" # Keycloak is deployed but we disable OSMO's internal auth # because OSMO's JWT validation expects its own keys, not Keycloak's @@ -1338,24 +1235,18 @@ log_success "Service ports verified" # ----------------------------------------------------------------------------- # Step 11: Deploy NGINX Proxy # ----------------------------------------------------------------------------- -# The nginx proxy routes traffic to osmo-service, osmo-logger, osmo-agent, and osmo-ui +# The nginx proxy routes traffic to osmo-service, osmo-logger, and osmo-agent # Required for osmo-ctrl sidecar to communicate with the OSMO service -# NOTE: If Terraform created the osmo-proxy resources, this step will just verify them +log_info "Deploying OSMO proxy (nginx)..." -if kubectl get deployment osmo-proxy -n "${OSMO_NAMESPACE}" &>/dev/null; then - log_info "OSMO proxy already exists (created by Terraform)" - kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ - log_warning "Timeout waiting for osmo-proxy rollout" - log_success "OSMO proxy verified" -elif [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then - log_info "Deploying OSMO proxy (nginx)..." +if [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then kubectl apply -f "${SCRIPT_DIR}/nginx-proxy.yaml" kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ log_warning "Timeout waiting for osmo-proxy rollout" log_success "OSMO proxy deployed" else - log_warning "nginx-proxy.yaml not found and osmo-proxy not deployed by Terraform" - log_warning "Workflows may fail without the proxy. Run 'terraform apply' or create nginx-proxy.yaml manually." 
+ log_warning "nginx-proxy.yaml not found - skipping proxy deployment" + log_warning "Workflows may fail without the proxy. Create nginx-proxy.yaml and apply manually." fi # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh index 5e50c55df..8a57177bf 100755 --- a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh @@ -147,8 +147,7 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then --service --roles osmo-backend 2>&1) # Extract token from output (format: "Access token: ") - # Note: Using sed instead of grep -P for macOS compatibility - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: \(.*\)/\1/p' || echo "") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | grep -oP 'Access token: \K.*' || echo "") if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then log_error "Failed to create service token" diff --git a/applications/osmo/deploy/002-setup/05-configure-storage.sh b/applications/osmo/deploy/002-setup/05-configure-storage.sh index 843897699..47e7a9d53 100755 --- a/applications/osmo/deploy/002-setup/05-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/05-configure-storage.sh @@ -6,14 +6,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/06-configure-service-url.sh b/applications/osmo/deploy/002-setup/06-configure-service-url.sh index de73f55c8..76c4ee481 100755 --- 
a/applications/osmo/deploy/002-setup/06-configure-service-url.sh +++ b/applications/osmo/deploy/002-setup/06-configure-service-url.sh @@ -6,14 +6,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh index 6c8be3a71..aa371b74f 100755 --- a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh +++ b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh @@ -4,14 +4,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" OSMO_URL="${OSMO_URL:-http://localhost:8080}" diff --git a/applications/osmo/deploy/002-setup/SSO-SETUP.md b/applications/osmo/deploy/002-setup/SSO-SETUP.md deleted file mode 100644 index 717874f68..000000000 --- a/applications/osmo/deploy/002-setup/SSO-SETUP.md +++ /dev/null @@ -1,275 +0,0 @@ -# OSMO SSO Setup with Google and Azure AD via Keycloak - -This document describes the SSO (Single Sign-On) implementation for OSMO using Keycloak as an identity broker with Google and Azure AD as identity providers. - -## Overview - -The implementation enables users to authenticate to OSMO using their Google or Microsoft (Azure AD) accounts instead of local credentials. 
Keycloak acts as an identity broker that federates authentication to these external identity providers. - -``` -User → OSMO UI → Keycloak → Google OAuth2 - → Azure AD (Microsoft) -``` - -## Prerequisites - -### Google Cloud Console Credentials - -1. Go to [Google Cloud Console](https://console.cloud.google.com/) → APIs & Services → Credentials -2. Create OAuth 2.0 Client ID (Web application) -3. Note down: - - **Client ID**: `xxxxxxxxx.apps.googleusercontent.com` - - **Client Secret**: `GOCSPX-xxxxxxxxx` -4. Add authorized redirect URI (after Keycloak is deployed): - - `http:///realms/osmo/broker/google/endpoint` - -### Azure Portal Credentials - -1. Go to [Azure Portal](https://portal.azure.com/) → Azure Active Directory → App registrations -2. New registration → Name: "OSMO SSO" -3. Note down: - - **Application (client) ID**: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` - - **Directory (tenant) ID**: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` -4. Create client secret: Certificates & secrets → New client secret - - **Client Secret**: `xxxxxxxxxxxxxxxxxxxxxxxxx` -5. 
Add redirect URI: Authentication → Add platform → Web - - `http:///realms/osmo/broker/microsoft/endpoint` - -## Deployment - -### Step 1: Set Environment Variables - -```bash -# Google OAuth2 credentials -export GOOGLE_CLIENT_ID="your-google-client-id.apps.googleusercontent.com" -export GOOGLE_CLIENT_SECRET="GOCSPX-your-secret" - -# Azure AD credentials -export AZURE_CLIENT_ID="your-azure-application-id" -export AZURE_CLIENT_SECRET="your-azure-client-secret" -export AZURE_TENANT_ID="your-tenant-id" # or 'common' for multi-tenant -``` - -### Step 2: Deploy OSMO with Keycloak - -```bash -cd applications/osmo/deploy/002-setup -DEPLOY_KEYCLOAK=true ./03-deploy-osmo-control-plane.sh -``` - -### Step 3: Update Redirect URIs - -After deployment, get the Keycloak URL and update the redirect URIs in Google Cloud Console and Azure Portal: - -**For port-forward access (development):** -```bash -kubectl port-forward -n osmo svc/keycloak 8081:80 -# Keycloak URL: http://localhost:8081 -``` - -**For LoadBalancer access (production):** -```bash -# Get the external IP -kubectl get svc osmo-proxy -n osmo -o jsonpath='{.status.loadBalancer.ingress[0].ip}' -# Keycloak URL: http:///realms/osmo -``` - -Update redirect URIs: -- **Google Console**: `http:///realms/osmo/broker/google/endpoint` -- **Azure Portal**: `http:///realms/osmo/broker/microsoft/endpoint` - -## Files Changed - -### 1. `applications/osmo/deploy/002-setup/nginx-proxy.yaml` - -Added Keycloak upstream and routing for SSO redirect flows. - -**Changes:** -```yaml -# Added upstream for Keycloak -upstream keycloak { - server keycloak.osmo.svc.cluster.local:80; -} - -# Added location blocks for Keycloak paths -location /auth/ { - proxy_pass http://keycloak; -} - -location /realms/ { - proxy_pass http://keycloak; -} - -location /admin/ { - proxy_pass http://keycloak; -} - -location /js/ { - proxy_pass http://keycloak; -} - -location /resources/ { - proxy_pass http://keycloak; -} -``` - -### 2. 
`applications/osmo/deploy/001-iac/osmo-proxy.tf` - -Same nginx configuration changes as above, keeping Terraform and standalone YAML in sync. - -**Changes:** -- Added `keycloak` upstream server block -- Added location blocks for `/auth/`, `/realms/`, `/admin/`, `/js/`, `/resources/` - -### 3. `applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh` - -Added SSO configuration support. - -**Changes:** - -1. **Environment variable declarations** (lines 467-475): -```bash -# SSO Identity Provider Configuration (Google and Azure AD) -GOOGLE_CLIENT_ID="${GOOGLE_CLIENT_ID:-}" -GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET:-}" -AZURE_CLIENT_ID="${AZURE_CLIENT_ID:-}" -AZURE_CLIENT_SECRET="${AZURE_CLIENT_SECRET:-}" -AZURE_TENANT_ID="${AZURE_TENANT_ID:-common}" -``` - -2. **Google Identity Provider configuration** in Keycloak setup job: -```bash -# Configure Google Identity Provider (if credentials provided) -if [ -n "$GOOGLE_CLIENT_ID" ] && [ -n "$GOOGLE_CLIENT_SECRET" ]; then - curl -s -X POST "${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ - -H "Authorization: Bearer $TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "alias": "google", - "displayName": "Google", - "providerId": "google", - "enabled": true, - "trustEmail": true, - "config": { - "clientId": "...", - "clientSecret": "...", - "defaultScope": "openid email profile", - "syncMode": "IMPORT" - } - }' -fi -``` - -3. 
**Azure AD Identity Provider configuration** in Keycloak setup job: -```bash -# Configure Azure AD (Microsoft) Identity Provider (if credentials provided) -if [ -n "$AZURE_CLIENT_ID" ] && [ -n "$AZURE_CLIENT_SECRET" ]; then - curl -s -X POST "${KEYCLOAK_URL}/admin/realms/osmo/identity-provider/instances" \ - -H "Authorization: Bearer $TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "alias": "microsoft", - "displayName": "Microsoft", - "providerId": "microsoft", - "enabled": true, - "trustEmail": true, - "config": { - "clientId": "...", - "clientSecret": "...", - "tenant": "...", - "defaultScope": "openid email profile", - "syncMode": "IMPORT" - } - }' -fi -``` - -4. **Output messages** showing SSO status and configuration instructions. - -## Architecture - -### Authentication Flow - -1. User accesses OSMO UI at `http:///` -2. User clicks "Login" and is redirected to Keycloak -3. Keycloak login page shows options: "Login with Google" or "Login with Microsoft" -4. User selects identity provider and authenticates -5. Identity provider redirects back to Keycloak with authorization code -6. Keycloak exchanges code for tokens and creates/links user account -7. User is redirected back to OSMO UI with session established - -### Network Flow - -``` -External Traffic - │ - ▼ -┌──────────────────┐ -│ LoadBalancer │ -│ (osmo-proxy) │ -└────────┬─────────┘ - │ - ▼ -┌──────────────────────────────────────────────────┐ -│ NGINX │ -│ ┌─────────────┐ ┌─────────────┐ ┌───────────┐ │ -│ │ /api/* │ │ /realms/* │ │ /* │ │ -│ │ osmo-service│ │ keycloak │ │ osmo-ui │ │ -│ └─────────────┘ └─────────────┘ └───────────┘ │ -└──────────────────────────────────────────────────┘ -``` - -## Verification - -1. Access OSMO UI: `http:///` -2. Click "Login" -3. Verify Google and Microsoft login options appear -4. Test authentication with each provider -5. 
Verify user is logged in (not as "guest") - -## Troubleshooting - -### Check Keycloak Logs -```bash -kubectl logs -n osmo -l app.kubernetes.io/name=keycloak --tail=100 -``` - -### Check Identity Provider Configuration -```bash -# Port-forward to Keycloak -kubectl port-forward -n osmo svc/keycloak 8081:80 - -# Access admin console -open http://localhost:8081/admin - -# Login: admin / -kubectl get secret keycloak-admin-secret -n osmo -o jsonpath='{.data.password}' | base64 -d -``` - -### Verify Redirect URIs -In Keycloak Admin Console: -1. Go to Identity Providers → Google/Microsoft -2. Copy the "Redirect URI" shown -3. Ensure this exact URI is configured in Google Cloud Console / Azure Portal - -### Common Issues - -1. **"Invalid redirect_uri" error** - - The redirect URI in Google/Azure doesn't match Keycloak's expected URI - - Copy exact URI from Keycloak Identity Provider settings - -2. **"Login with Google/Microsoft" not showing** - - Identity provider not configured (credentials not set during deployment) - - Re-run deployment with environment variables set - -3. 
**User created but can't access OSMO** - - Check if user is created in Keycloak (Users section) - - Verify user has appropriate roles assigned - -## Security Considerations - -- Use HTTPS in production (configure TLS on LoadBalancer or Ingress) -- Restrict `trustEmail` if email verification is required -- Consider configuring allowed domains for Google/Azure authentication -- Rotate client secrets periodically -- Use Kubernetes secrets management (e.g., External Secrets Operator) for credentials in production diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh index 0a13bb5ae..de869a0cf 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -23,8 +16,7 @@ echo "========================================" echo "" log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" -printf "Continue? (y/N): " -read confirm +read_prompt_var "Continue? 
(y/N)" confirm "" if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh index 8326e2683..e847de5a6 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -23,8 +16,35 @@ echo "========================================" echo "" log_warning "This will remove Prometheus, Grafana, and Loki" -printf "Continue? (y/N): " -read confirm +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +read_prompt_var "Continue? 
(y/N)" confirm "" if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh index 90a66b9e4..2db7ef8da 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -23,8 +16,7 @@ echo "========================================" echo "" log_warning "This will remove OSMO Backend services" -printf "Continue? (y/N): " -read confirm +read_prompt_var "Continue? 
(y/N)" confirm "" if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh index fed544c6f..0abb5f560 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh @@ -5,14 +5,7 @@ set -e -# Determine script directory (works in bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" @@ -23,8 +16,7 @@ echo "========================================" echo "" log_warning "This will remove OSMO Control Plane and all OSMO resources" -printf "Continue? (y/N): " -read confirm +read_prompt_var "Continue? 
(y/N)" confirm "" if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 8cce2538d..14aae0071 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -31,14 +31,7 @@ export PROMETHEUS_RETENTION_DAYS="15" export LOKI_RETENTION_DAYS="7" export GRAFANA_ADMIN_PASSWORD="" # Auto-generated if empty -# Paths (compatible with bash and zsh) -if [[ -n "${BASH_SOURCE[0]:-}" ]]; then - export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -elif [[ -n "${ZSH_VERSION:-}" ]]; then - # zsh - use %x prompt expansion for script path - export SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)" -else - export SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -fi +# Paths +export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" export VALUES_DIR="${SCRIPT_DIR}/values" export LIB_DIR="${SCRIPT_DIR}/lib" diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index bc90c60c1..68800b7f0 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -27,6 +27,56 @@ log_error() { echo -e "${RED}[✗]${NC} $1" } +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +# Read a secret value into a variable (no echo). 
+read_secret_var() { + local prompt=$1 + local var_name=$2 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + printf "%s: " "$prompt" >"$write_to" + stty -echo <"$read_from" + IFS= read -r value <"$read_from" + stty echo <"$read_from" + printf "\n" >"$write_to" + + eval "$var_name='$value'" +} + # Check if command exists check_command() { command -v "$1" &>/dev/null diff --git a/applications/osmo/deploy/002-setup/nginx-proxy.yaml b/applications/osmo/deploy/002-setup/nginx-proxy.yaml index 3bf0ee743..b8eab7837 100755 --- a/applications/osmo/deploy/002-setup/nginx-proxy.yaml +++ b/applications/osmo/deploy/002-setup/nginx-proxy.yaml @@ -13,7 +13,7 @@ data: # Logging access_log /dev/stdout; error_log /dev/stderr; - + # Conditional WebSocket support # Sets Connection header to "upgrade" for WebSocket requests, "close" otherwise # This is important for proper handling of both WebSocket and regular HTTP requests @@ -21,85 +21,51 @@ data: default upgrade; '' close; } - + # Upstream servers upstream osmo-service { server osmo-service.osmo.svc.cluster.local:80; } - + upstream osmo-logger { server osmo-logger.osmo.svc.cluster.local:80; } - + upstream osmo-agent { server osmo-agent.osmo.svc.cluster.local:80; } - - upstream osmo-ui { - server osmo-ui.osmo.svc.cluster.local:80; - } - - upstream keycloak { - server keycloak.osmo.svc.cluster.local:80; - } - + server { listen 80; - + # Common proxy headers proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; - + # WebSocket support (conditional based on Upgrade header) proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection $connection_upgrade; - + # Timeouts for long-running WebSocket connections (osmo-ctrl logging) proxy_read_timeout 3600s; 
proxy_send_timeout 3600s; - + # Route /api/logger/* to osmo-logger (WebSocket for log streaming) location /api/logger/ { proxy_pass http://osmo-logger; } - + # Route /api/agent/* to osmo-agent (WebSocket for backend communication) location /api/agent/ { proxy_pass http://osmo-agent; } - - # Route /api/* to osmo-service (REST API) - location /api/ { - proxy_pass http://osmo-service; - } - - # Route /auth/* and /realms/* to Keycloak (SSO with Google/Azure AD) - location /auth/ { - proxy_pass http://keycloak; - } - - location /realms/ { - proxy_pass http://keycloak; - } - - location /admin/ { - proxy_pass http://keycloak; - } - - location /js/ { - proxy_pass http://keycloak; - } - - location /resources/ { - proxy_pass http://keycloak; - } - - # Route everything else to osmo-ui (Web UI) + + # Everything else to osmo-service (REST API) location / { - proxy_pass http://osmo-ui; + proxy_pass http://osmo-service; } } } @@ -151,4 +117,4 @@ spec: ports: - port: 80 targetPort: 80 - type: LoadBalancer + type: ClusterIP diff --git a/applications/osmo/deploy/002-setup/values/prometheus.yaml b/applications/osmo/deploy/002-setup/values/prometheus.yaml index c97a75692..12cc634d9 100755 --- a/applications/osmo/deploy/002-setup/values/prometheus.yaml +++ b/applications/osmo/deploy/002-setup/values/prometheus.yaml @@ -4,6 +4,8 @@ # Prometheus prometheus: prometheusSpec: + # Some CRDs require this to be >= 60 + maximumStartupDurationSeconds: 60 retention: 15d resources: diff --git a/modules/gpu-operator-custom/helm.tf b/modules/gpu-operator-custom/helm.tf old mode 100644 new mode 100755 diff --git a/modules/gpu-operator-custom/variables.tf b/modules/gpu-operator-custom/variables.tf old mode 100644 new mode 100755 diff --git a/modules/nims/Readme.md b/modules/nims/Readme.md old mode 100644 new mode 100755 index 392d04e01..20f365a99 --- a/modules/nims/Readme.md +++ b/modules/nims/Readme.md @@ -18,3 +18,4 @@ This Terraform module deploys a **Kubernetes namespace** with secrets and a **Lo 
- Uses model cache on shared filesystem - Creates bionemo instances (on a seperate load balancer) --- + diff --git a/modules/nims/bionemo.tf b/modules/nims/bionemo.tf old mode 100644 new mode 100755 diff --git a/modules/nims/boltz2.tf b/modules/nims/boltz2.tf old mode 100644 new mode 100755 index 3c9bdba21..6c04085f4 --- a/modules/nims/boltz2.tf +++ b/modules/nims/boltz2.tf @@ -1,3 +1,4 @@ + resource "kubernetes_deployment" "boltz2" { metadata { name = "boltz2" diff --git a/modules/nims/evo2_40.tf b/modules/nims/evo2_40.tf deleted file mode 100644 index 2f4bbd261..000000000 --- a/modules/nims/evo2_40.tf +++ /dev/null @@ -1,117 +0,0 @@ -resource "kubernetes_deployment" "evo2_40b" { - metadata { - name = "evo2-40b" - namespace = var.namespace - } - - spec { - replicas = var.evo2_40b ? var.evo2_40b_replicas : 0 - - selector { - match_labels = { - app = "evo2-40b" - } - } - - template { - metadata { - labels = { - app = "evo2-40b" - lb_group = "protein-apps" - - } - } - - spec { - - image_pull_secrets { - name = kubernetes_secret.nvcrio-cred.metadata[0].name - } - # init_container { - # name = "init-mnt-data" - # image = "busybox:1.36" - # - # command = [ - # "sh", "-c", - # "mkdir -p /mnt/data/nim && chown -R 1000t:1000 /mnt/data/nim" - # ] - # - # volume_mount { - # name = "mnt-data" - # mount_path = "/mnt/data" - # } - # } - - container { - - name = "evo2-40b" - image = "nvcr.io/nim/arc/evo2-40b:${var.evo2_40b_version}" - - command = ["/bin/bash", "-c", "/opt/nim/start_server.sh"] - security_context { - run_as_user = 0 - run_as_group = 0 - } - - env { - name = "NGC_API_KEY" - - value_from { - secret_key_ref { - name = kubernetes_secret.ngc_api_key.metadata[0].name - key = "NGC_API_KEY" - } - } - } - - port { - container_port = 8000 - } - - resources { - limits = { - cpu = "16" - memory = "128Gi" - "nvidia.com/gpu" = "1" - } - - requests = { - cpu = "16" - memory = "128Gi" - "nvidia.com/gpu" = "1" - } - } - - volume_mount { - name = "dshm" - mount_path = "/dev/shm" - 
} - volume_mount { - name = "mnt-data" - mount_path = "/opt/nim/.cache/" - # mount_path = "/mnt/data/" - } - } - - - - volume { - name = "dshm" - - empty_dir { - medium = "Memory" - size_limit = "16Gi" - } - } - volume { - name = "mnt-data" - - host_path { - path = "/mnt/data/nim" - type = "DirectoryOrCreate" - } - } - } - } - } -} diff --git a/modules/nims/genmol.tf b/modules/nims/genmol.tf old mode 100644 new mode 100755 index 59c89ae56..1c6643a62 --- a/modules/nims/genmol.tf +++ b/modules/nims/genmol.tf @@ -1,3 +1,4 @@ + resource "kubernetes_deployment" "genmol" { metadata { name = "genmol" diff --git a/modules/nims/main.tf b/modules/nims/main.tf old mode 100644 new mode 100755 index a1b1f6770..38ab07dd0 --- a/modules/nims/main.tf +++ b/modules/nims/main.tf @@ -1,6 +1,4 @@ - resource "kubernetes_namespace" "nims" { - metadata { name = var.namespace } @@ -98,3 +96,5 @@ resource "kubernetes_service" "openfold3_lb" { } } } + + diff --git a/modules/nims/msa-search.tf b/modules/nims/msa-search.tf old mode 100644 new mode 100755 index 3b480c7b4..0acf04c04 --- a/modules/nims/msa-search.tf +++ b/modules/nims/msa-search.tf @@ -1,3 +1,4 @@ + resource "kubernetes_deployment" "msa_search" { metadata { name = "msa-search" diff --git a/modules/nims/openfold2.tf b/modules/nims/openfold2.tf old mode 100644 new mode 100755 index 1360f5039..bdda9c288 --- a/modules/nims/openfold2.tf +++ b/modules/nims/openfold2.tf @@ -1,3 +1,4 @@ + resource "kubernetes_deployment" "openfold2" { metadata { name = "openfold2" diff --git a/modules/nims/openfold3.tf b/modules/nims/openfold3.tf old mode 100644 new mode 100755 index c46c55a21..b3900d63b --- a/modules/nims/openfold3.tf +++ b/modules/nims/openfold3.tf @@ -1,3 +1,4 @@ + resource "kubernetes_deployment" "openfold3" { metadata { name = "openfold3" diff --git a/modules/nims/output.tf b/modules/nims/output.tf deleted file mode 100644 index 4d655835e..000000000 --- a/modules/nims/output.tf +++ /dev/null @@ -1,3 +0,0 @@ -output "openfold3_lb_ip" 
{ - value = kubernetes_service.openfold3_lb.status[0].load_balancer[0].ingress[0].ip -} diff --git a/modules/nims/provider.tf b/modules/nims/provider.tf old mode 100644 new mode 100755 diff --git a/modules/nims/qwen3-next-80b-a3b-instruct.tf b/modules/nims/qwen3-next-80b-a3b-instruct.tf old mode 100644 new mode 100755 diff --git a/modules/nims/variables.tf b/modules/nims/variables.tf old mode 100644 new mode 100755 index a15ef9915..28f41bb50 --- a/modules/nims/variables.tf +++ b/modules/nims/variables.tf @@ -7,6 +7,7 @@ variable "parent_id" { variable "ngc_key" { description = "API key from Nvidia GPU cloud: catalog.ngc.nvidia.com" type = string + default = "" } variable "openfold3" { From 6ed4e5ccac264ce10730da47b66ae665faf4e82e Mon Sep 17 00:00:00 2001 From: Timothy Le Date: Fri, 6 Feb 2026 16:57:10 -0500 Subject: [PATCH 05/37] edit in setup instructions Signed-off-by: Timothy Le --- applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh | 4 ++-- .../osmo/deploy/002-setup/07-configure-gpu-platform.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh index 8a57177bf..23c8fa47b 100755 --- a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh @@ -147,8 +147,8 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then --service --roles osmo-backend 2>&1) # Extract token from output (format: "Access token: ") - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | grep -oP 'Access token: \K.*' || echo "") - + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then log_error "Failed to create service token" echo "Output: $TOKEN_OUTPUT" diff --git a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh index 
aa371b74f..d18f15208 100755 --- a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh +++ b/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh @@ -123,7 +123,7 @@ fi log_success "GPU platform configuration complete" echo "" echo "To submit a GPU workflow:" -echo " osmo workflow submit workflows/osmo/gpu_test.yaml" +echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default" echo "" echo "Or test via curl:" echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" From 4a7280ab860cba9ddb92b90f3f419e2990d85a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 10 Feb 2026 12:54:52 +0100 Subject: [PATCH 06/37] add multi region support --- applications/osmo/README.md | 44 ++++--- .../000-prerequisites/nebius-env-init.sh | 119 +++++++++--------- applications/osmo/deploy/001-iac/locals.tf | 44 +++++-- .../001-iac/modules/platform/variables.tf | 16 +-- .../001-iac/modules/wireguard/variables.tf | 4 +- ...aform.tfvars.cost-optimized-secure.example | 18 +-- .../terraform.tfvars.cost-optimized.example | 16 +-- .../deploy/001-iac/terraform.tfvars.example | 10 +- .../terraform.tfvars.production.example | 6 +- .../001-iac/terraform.tfvars.secure.example | 6 +- applications/osmo/deploy/001-iac/variables.tf | 21 ++-- 11 files changed, 177 insertions(+), 127 deletions(-) diff --git a/applications/osmo/README.md b/applications/osmo/README.md index 41e0e209a..645de1eb2 100755 --- a/applications/osmo/README.md +++ b/applications/osmo/README.md @@ -2,14 +2,21 @@ Deploy [NVIDIA OSMO](https://nvidia.github.io/OSMO/main/user_guide/index.html) on [Nebius AI Cloud](https://nebius.com/ai-cloud) in minutes. Run simulation, training, and edge workflows on the wide variety of Nebius GPU instances—write once in YAML, run anywhere. 
-## Tested in/with -- eu-north-1 +## Supported Regions + +| Region | Available GPU Platforms | +|--------|----------------------| +| `eu-north1` | gpu-h100-sxm, gpu-h200-sxm, gpu-l40s-a, gpu-l40s-d | +| `eu-north2` | gpu-h200-sxm | +| `eu-west1` | gpu-h200-sxm | +| `me-west1` | gpu-b200-sxm-a (NVIDIA B200) | +| `uk-south1` | gpu-b300-sxm (NVIDIA B300) | +| `us-central1` | gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) | ## Known Gaps and TODOs | Gap | Current Workaround | Status | |-----|-------------------|--------| -| Multi-region support | Code tested only in eu-north1; other regions have different GPU platforms (H100/H200/L40S), CPU platforms (cpu-d3 vs cpu-e2), disk types, and PostgreSQL presets | TODO | | No managed Redis service | Deploy Redis in-cluster via Helm | Workaround in place | | MysteryBox lacks K8s CSI integration | Scripts retrieve secrets and create K8s secrets manually | Workaround in place | | No External DNS service | Manual DNS configuration required | Not addressed | @@ -62,7 +69,8 @@ Production-ready infrastructure-as-code (Terraform) and setup scripts for: │ │ │ │ │ │ │ │ │ └─────────┼───► ┌──────────────┐ ┌───────────────────┐ │ │ │ │ │ │ │ CPU Nodes │ │ GPU Nodes │ │ │ │ -│ │ │ │ (cpu-d3) │ │ (L40S/H100/H200) │ │ │ │ +│ │ │ │ (cpu-d3) │ │ (L40S/H100/H200/ │ │ │ │ +│ │ │ │ │ │ B200/B300) │ │ │ │ │ │ │ │ System pods │ │ Workflow pods │ │ │ │ │ │ │ └──────────────┘ └───────────────────┘ │ │ │ │ │ │ │ │ │ @@ -138,7 +146,7 @@ This interactive script: 2. **Checks authentication** - If not authenticated, provides instructions to run `nebius profile create` 3. **Lists tenants** - Auto-detects if you have only one tenant 4. **Configures project** - Select existing project, create new one, or list available projects -5. **Sets region** - Choose between `eu-north1` (Finland) or `eu-west1` (Paris) +5. **Sets region** - Choose from `eu-north1`, `eu-north2`, `eu-west1`, `me-west1`, `uk-south1`, `us-central1` 6. 
**Exports environment variables** - Sets `NEBIUS_*` and `TF_VAR_*` variables for Terraform ### 3. Initialize Secrets (REQUIRED) @@ -375,16 +383,22 @@ See `deploy/001-iac/terraform.tfvars.*.example` files for all configuration opti ## GPU Options -| Platform | Preset | GPUs | VRAM | vCPUs | RAM | InfiniBand | -|----------|--------|------|------|-------|-----|------------| -| `gpu-l40s-a` | `1gpu-8vcpu-32gb` | 1 | 48GB | 8 | 32GB | No | -| `gpu-l40s-d` | `1gpu-8vcpu-32gb` | 1 | 48GB | 8 | 32GB | No | -| `gpu-h100-sxm` | `1gpu-16vcpu-200gb` | 1 | 80GB | 16 | 200GB | No | -| `gpu-h100-sxm` | `8gpu-128vcpu-1600gb` | 8 | 640GB | 128 | 1600GB | Yes | -| `gpu-h200-sxm` | `1gpu-16vcpu-200gb` | 1 | 141GB | 16 | 200GB | No | -| `gpu-h200-sxm` | `8gpu-128vcpu-1600gb` | 8 | 1128GB | 128 | 1600GB | Yes | - -**Recommendation:** Use `gpu-l40s-a` for development/testing (cheapest option). +| Platform | Preset | GPUs | vCPUs | RAM | InfiniBand | Regions | +|----------|--------|------|-------|-----|------------|---------| +| `gpu-l40s-a` | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | No | eu-north1 | +| `gpu-l40s-d` | `1gpu-8vcpu-32gb` | 1 | 8 | 32GB | No | eu-north1 | +| `gpu-h100-sxm` | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | No | eu-north1 | +| `gpu-h100-sxm` | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | Yes | eu-north1 | +| `gpu-h200-sxm` | `1gpu-16vcpu-200gb` | 1 | 16 | 200GB | No | eu-north1, eu-north2, eu-west1, us-central1 | +| `gpu-h200-sxm` | `8gpu-128vcpu-1600gb` | 8 | 128 | 1600GB | Yes | eu-north1, eu-north2, eu-west1, us-central1 | +| `gpu-b200-sxm` | `1gpu-20vcpu-224gb` | 1 | 20 | 224GB | No | us-central1 | +| `gpu-b200-sxm` | `8gpu-160vcpu-1792gb` | 8 | 160 | 1792GB | Yes | us-central1 | +| `gpu-b200-sxm-a` | `1gpu-20vcpu-224gb` | 1 | 20 | 224GB | No | me-west1 | +| `gpu-b200-sxm-a` | `8gpu-160vcpu-1792gb` | 8 | 160 | 1792GB | Yes | me-west1 | +| `gpu-b300-sxm` | `1gpu-24vcpu-346gb` | 1 | 24 | 346GB | No | uk-south1 | +| `gpu-b300-sxm` | `8gpu-192vcpu-2768gb` | 8 | 192 | 2768GB | 
Yes | uk-south1 | + +**Recommendation:** Use `gpu-l40s-a` for development/testing in eu-north1 (cheapest option). ## Required Permissions diff --git a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh index d1d18ed6f..a9ad6a2e5 100755 --- a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh @@ -280,16 +280,16 @@ main() { echo -e "${GREEN}[✓]${NC} Nebius CLI authenticated" echo "" - # Step 3: Get current profile info - echo -e "${BLUE}Step 3: Retrieving profile information${NC}" - + # Step 3: Configure deployment settings + echo -e "${BLUE}Step 3: Configure deployment settings${NC}" + local nebius_path=$(get_nebius_path) - + # Check for existing environment variables or use defaults local current_tenant="${NEBIUS_TENANT_ID:-}" local current_project="${NEBIUS_PROJECT_ID:-}" local current_region="${NEBIUS_REGION:-eu-north1}" - + # Sanitize previously set values in case they were corrupted by a failed prompt if [[ -n "$current_tenant" && ! "$current_tenant" =~ ^tenant-[a-z0-9]+$ ]]; then current_tenant="" @@ -297,58 +297,61 @@ main() { if [[ -n "$current_project" && ! "$current_project" =~ ^project-[a-z0-9]+$ ]]; then current_project="" fi + echo "" - # Try to list tenants to help user find their tenant ID - echo "Fetching available tenants..." - local tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) - if [[ -n "$tenants" && "$tenants" != "[]" ]]; then + # Tenant ID + if [[ -z "$current_tenant" ]]; then + echo "Tenant ID is required. 
Find it in the Nebius Console under IAM > Tenants" echo "" - echo "Available tenants:" - if has_jq; then - local page_token="" - local total_count=0 - local last_tenant_id="" - while :; do - if [[ -n "$page_token" ]]; then - tenants=$("$nebius_path" iam tenant list --format json --page-token "$page_token" 2>/dev/null) - else - tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) - fi + read_prompt_var "List available tenants? (y/N)" list_tenants "" - echo "$tenants" | jq -r '.items // . | map(select(.metadata.name | startswith("billing-test") | not)) | .[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true - local page_count - page_count=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | length' 2>/dev/null || echo "0") - total_count=$((total_count + page_count)) - if [[ "$page_count" -gt 0 ]]; then - last_tenant_id=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | .[-1].metadata.id' 2>/dev/null) - fi + if [[ "$list_tenants" =~ ^[yY]$ ]]; then + echo "" + echo "Fetching available tenants..." + local tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) + if [[ -n "$tenants" && "$tenants" != "[]" ]]; then + echo "" + echo "Available tenants:" + if has_jq; then + local page_token="" + local total_count=0 + local last_tenant_id="" + while :; do + if [[ -n "$page_token" ]]; then + tenants=$("$nebius_path" iam tenant list --format json --page-token "$page_token" 2>/dev/null) + else + tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) + fi - page_token=$(echo "$tenants" | jq -r '.next_page_token // empty' 2>/dev/null) - if [[ -z "$page_token" ]]; then - break - fi - done + echo "$tenants" | jq -r '.items // . 
| map(select(.metadata.name | startswith("billing-test") | not)) | .[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true + local page_count + page_count=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | length' 2>/dev/null || echo "0") + total_count=$((total_count + page_count)) + if [[ "$page_count" -gt 0 ]]; then + last_tenant_id=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | .[-1].metadata.id' 2>/dev/null) + fi - # Auto-detect if only one tenant across all pages - if [[ "$total_count" == "1" && -z "$current_tenant" ]]; then - current_tenant="$last_tenant_id" - echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" + page_token=$(echo "$tenants" | jq -r '.next_page_token // empty' 2>/dev/null) + if [[ -z "$page_token" ]]; then + break + fi + done + + # Auto-detect if only one tenant across all pages + if [[ "$total_count" == "1" ]]; then + current_tenant="$last_tenant_id" + echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" + fi + else + echo " (jq not found; run 'brew install jq' to show tenants)" + fi + else + echo " No tenants found." fi - else - echo " (jq not found; run 'brew install jq' to show tenants)" + echo "" fi - fi - - echo "" - - # Step 4: Interactive configuration - echo -e "${BLUE}Step 4: Configure deployment settings${NC}" - echo "" - - # Tenant ID - if [[ -z "$current_tenant" ]]; then - echo "Tenant ID is required. 
Find it in the Nebius Console under IAM > Tenants" - prompt_with_default "Enter Tenant ID" "" "NEBIUS_TENANT_ID" + + prompt_with_default "Enter Tenant ID" "$current_tenant" "NEBIUS_TENANT_ID" else prompt_with_default "Tenant ID" "$current_tenant" "NEBIUS_TENANT_ID" fi @@ -410,13 +413,17 @@ main() { # Region echo "" echo "Available regions:" - echo " - eu-north1 (Finland - H100)" - echo " - eu-west1 (Paris - H200)" + echo " - eu-north1 (Finland - H100, H200, L40S)" + echo " - eu-north2 (H200)" + echo " - eu-west1 (H200)" + echo " - me-west1 (B200)" + echo " - uk-south1 (B300)" + echo " - us-central1 (H200, B200)" prompt_with_default "Region" "${current_region:-eu-north1}" "NEBIUS_REGION" - # Step 5: Export environment variables + # Step 4: Export environment variables echo "" - echo -e "${BLUE}Step 5: Setting environment variables${NC}" + echo -e "${BLUE}Step 4: Setting environment variables${NC}" export NEBIUS_TENANT_ID export NEBIUS_PROJECT_ID @@ -448,9 +455,9 @@ main() { echo " TF_VAR_parent_id = $TF_VAR_parent_id" echo " TF_VAR_region = $TF_VAR_region" - # Step 6: Verify connectivity + # Step 5: Verify connectivity echo "" - echo -e "${BLUE}Step 6: Verifying connectivity${NC}" + echo -e "${BLUE}Step 5: Verifying connectivity${NC}" if "$nebius_path" iam project get --id "$NEBIUS_PROJECT_ID" &>/dev/null; then echo -e "${GREEN}[✓]${NC} Successfully connected to Nebius project" diff --git a/applications/osmo/deploy/001-iac/locals.tf b/applications/osmo/deploy/001-iac/locals.tf index 579adee1c..8ea891a24 100755 --- a/applications/osmo/deploy/001-iac/locals.tf +++ b/applications/osmo/deploy/001-iac/locals.tf @@ -18,15 +18,25 @@ locals { gpu_nodes_preset = "8gpu-128vcpu-1600gb" infiniband_fabric = "fabric-3" } - eu-west1 = { + eu-north2 = { gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset = "8gpu-128vcpu-1600gb" - infiniband_fabric = "fabric-5" + infiniband_fabric = "eu-north2-a" } - eu-north2 = { + eu-west1 = { gpu_nodes_platform = "gpu-h200-sxm" gpu_nodes_preset 
= "8gpu-128vcpu-1600gb" - infiniband_fabric = "eu-north2-a" + infiniband_fabric = "fabric-5" + } + me-west1 = { + gpu_nodes_platform = "gpu-b200-sxm-a" + gpu_nodes_preset = "8gpu-160vcpu-1792gb" + infiniband_fabric = "me-west1-a" + } + uk-south1 = { + gpu_nodes_platform = "gpu-b300-sxm" + gpu_nodes_preset = "8gpu-192vcpu-2768gb" + infiniband_fabric = "uk-south1-a" } us-central1 = { gpu_nodes_platform = "gpu-h200-sxm" @@ -36,14 +46,28 @@ locals { } # Available GPU platforms by region (for reference) + # # eu-north1: - # - gpu-h200-sxm (H200, 141GB VRAM) - high-end - # - gpu-h100-sxm (H100, 80GB VRAM) - high-end - # - gpu-l40s-a (L40S Intel, 48GB VRAM) - cost-effective - # - gpu-l40s-d (L40S AMD, 48GB VRAM) - cost-effective + # - gpu-h100-sxm (NVIDIA H100 80GB HBM3) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-l40s-a (L40S Intel, 48GB VRAM) presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb + # - gpu-l40s-d (L40S AMD, 48GB VRAM) presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb + # + # eu-north2: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # + # eu-west1: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # + # me-west1: + # - gpu-b200-sxm-a (NVIDIA B200) presets: 1gpu-20vcpu-224gb, 8gpu-160vcpu-1792gb + # + # uk-south1: + # - gpu-b300-sxm (NVIDIA B300 SXM6 AC) presets: 1gpu-24vcpu-346gb, 8gpu-192vcpu-2768gb # - # L40S presets: 1gpu-8vcpu-32gb, 2gpu-16vcpu-64gb (verify in console) - # H100/H200 presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # us-central1: + # - gpu-h200-sxm (NVIDIA H200) presets: 1gpu-16vcpu-200gb, 8gpu-128vcpu-1600gb + # - gpu-b200-sxm (NVIDIA B200) presets: 1gpu-20vcpu-224gb, 8gpu-160vcpu-1792gb # Current region config with overrides current_region = local.region_defaults[var.region] diff --git a/applications/osmo/deploy/001-iac/modules/platform/variables.tf 
b/applications/osmo/deploy/001-iac/modules/platform/variables.tf index 8d7f801a3..de02a2bf8 100755 --- a/applications/osmo/deploy/001-iac/modules/platform/variables.tf +++ b/applications/osmo/deploy/001-iac/modules/platform/variables.tf @@ -97,13 +97,13 @@ variable "postgresql_public_access" { } variable "postgresql_platform" { - description = "PostgreSQL platform (cpu-e2 for eu-north1, cpu-d3 for eu-west1)" + description = "PostgreSQL platform (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" type = string - default = "cpu-e2" - + default = "cpu-d3" + validation { condition = contains(["cpu-d3", "cpu-e2"], var.postgresql_platform) - error_message = "PostgreSQL platform must be cpu-e2 (eu-north1) or cpu-d3 (eu-west1)." + error_message = "PostgreSQL platform must be cpu-d3 (all regions) or cpu-e2 (eu-north1 only)." } } @@ -119,13 +119,13 @@ variable "postgresql_preset" { } variable "postgresql_disk_type" { - description = "PostgreSQL disk type (network-ssd for eu-north1, nbs-csi-sc for eu-west1)" + description = "PostgreSQL disk type (nbs-csi-sc for cpu-d3, network-ssd for cpu-e2/eu-north1)" type = string - default = "network-ssd" - + default = "nbs-csi-sc" + validation { condition = contains(["nbs-csi-sc", "network-ssd"], var.postgresql_disk_type) - error_message = "PostgreSQL disk type must be network-ssd (eu-north1) or nbs-csi-sc (eu-west1)." + error_message = "PostgreSQL disk type must be nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1)." 
} } diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/variables.tf b/applications/osmo/deploy/001-iac/modules/wireguard/variables.tf index 2d72f1d62..96d904227 100755 --- a/applications/osmo/deploy/001-iac/modules/wireguard/variables.tf +++ b/applications/osmo/deploy/001-iac/modules/wireguard/variables.tf @@ -42,9 +42,9 @@ variable "wg_network" { # ----------------------------------------------------------------------------- variable "platform" { - description = "VM platform" + description = "VM platform (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" type = string - default = "cpu-e2" + default = "cpu-d3" } variable "preset" { diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example index d967ec61e..ec55a62ea 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example @@ -66,11 +66,13 @@ enable_gpu_cluster = false # No InfiniBand for L40S enable_gpu_taints = true gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs -# GPU options in eu-north1 (cheapest to most expensive): -# gpu-l40s-a (L40S Intel, 48GB) - ~$1.55/hr -# gpu-l40s-d (L40S AMD, 48GB) - ~$1.55/hr -# gpu-h100-sxm (H100, 80GB) - ~$4-5/hr -# gpu-h200-sxm (H200, 141GB) - ~$5-6/hr +# GPU options by region (see locals.tf for full list): +# eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm +# eu-north2: gpu-h200-sxm +# eu-west1: gpu-h200-sxm +# me-west1: gpu-b200-sxm-a (NVIDIA B200) +# uk-south1: gpu-b300-sxm (NVIDIA B300) +# us-central1: gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) # ----------------------------------------------------------------------------- # Storage (minimal) @@ -81,9 +83,9 @@ filestore_size_gib = 256 # ----------------------------------------------------------------------------- # PostgreSQL 
(Nebius Managed Service - minimal) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) -postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) postgresql_disk_size_gib = 20 postgresql_host_count = 1 @@ -91,7 +93,7 @@ postgresql_host_count = 1 # WireGuard VPN (ENABLED for secure access) # ----------------------------------------------------------------------------- enable_wireguard = true -wireguard_platform = "cpu-e2" +wireguard_platform = "cpu-d3" wireguard_preset = "2vcpu-8gb" # Smallest for VPN wireguard_disk_size_gib = 32 wireguard_port = 51820 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example index 4e5fbba56..490e88b36 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example @@ -59,11 +59,13 @@ enable_gpu_cluster = false # No InfiniBand for L40S enable_gpu_taints = true gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs -# Other GPU options in eu-north1: -# gpu-l40s-a (L40S Intel, 48GB) - cheapest ~$1.55/hr -# gpu-l40s-d (L40S AMD, 48GB) - cheap ~$1.55/hr -# gpu-h100-sxm (H100, 80GB) - ~$4-5/hr -# gpu-h200-sxm (H200, 141GB) - most expensive +# GPU options by region (see locals.tf for full list): +# eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm +# eu-north2: gpu-h200-sxm +# eu-west1: gpu-h200-sxm +# me-west1: gpu-b200-sxm-a (NVIDIA B200) +# uk-south1: gpu-b300-sxm (NVIDIA B300) +# us-central1: gpu-h200-sxm, gpu-b200-sxm (NVIDIA B200) # 
----------------------------------------------------------------------------- # Storage (minimal) @@ -74,9 +76,9 @@ filestore_size_gib = 256 # Smaller filestore # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service - minimal) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) -postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) postgresql_disk_size_gib = 20 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.example b/applications/osmo/deploy/001-iac/terraform.tfvars.example index bfbc36dba..848e1a68d 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.example @@ -15,7 +15,7 @@ parent_id = "your-project-id" # From NEBIUS_PROJECT_ID # ----------------------------------------------------------------------------- # Environment Settings # ----------------------------------------------------------------------------- -region = "eu-north1" # or eu-west1 +region = "eu-north1" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 environment = "dev" # dev, staging, prod project_name = "osmo" # Used for resource naming @@ -40,7 +40,7 @@ cpu_nodes_assign_public_ip = false # Private nodes only # ----------------------------------------------------------------------------- gpu_nodes_count_per_group = 1 gpu_node_groups = 1 -gpu_nodes_platform = "gpu-h100-sxm" # or gpu-h200-sxm +gpu_nodes_platform = "gpu-h100-sxm" # See locals.tf for all GPU platforms per region gpu_nodes_preset = "8gpu-128vcpu-1600gb" gpu_disk_size_gib = 1023 gpu_nodes_assign_public_ip = false # Private 
nodes only @@ -57,10 +57,10 @@ filestore_size_gib = 1024 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service) # ----------------------------------------------------------------------------- -# Platform depends on region: cpu-d3 (eu-west1), cpu-e2 (eu-north1) -postgresql_platform = "cpu-e2" # Adjust for your region +# Platform depends on region: cpu-e2 (eu-north1), cpu-d3 (all other regions) +postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) postgresql_preset = "2vcpu-8gb" # 2/4/8/16 vcpu options available -postgresql_disk_type = "network-ssd" # network-ssd (eu-north1), nbs-csi-sc (eu-west1) +postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) postgresql_disk_size_gib = 50 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example index 9bda9b0fd..7ea2a726d 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example @@ -67,9 +67,9 @@ filestore_block_size_kib = 4 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service - HA) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) postgresql_preset = "16vcpu-64gb" # Production size -postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) postgresql_disk_size_gib = 100 postgresql_host_count = 3 # HA with replicas @@ -77,7 +77,7 @@ postgresql_host_count = 3 # HA with replicas # WireGuard VPN (enabled for secure access) # 
----------------------------------------------------------------------------- enable_wireguard = true -wireguard_platform = "cpu-e2" +wireguard_platform = "cpu-d3" wireguard_preset = "2vcpu-8gb" wireguard_disk_size_gib = 64 wireguard_port = 51820 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example index e8a98c8f1..c6826199d 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example @@ -62,9 +62,9 @@ filestore_size_gib = 1024 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-e2" # cpu-e2 for eu-north1 +postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) postgresql_preset = "2vcpu-8gb" # Minimum preset -postgresql_disk_type = "network-ssd" # Required for eu-north1 +postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) postgresql_disk_size_gib = 50 postgresql_host_count = 1 @@ -72,7 +72,7 @@ postgresql_host_count = 1 # WireGuard VPN (REQUIRED for this config) # ----------------------------------------------------------------------------- enable_wireguard = true -wireguard_platform = "cpu-e2" +wireguard_platform = "cpu-d3" wireguard_preset = "2vcpu-8gb" wireguard_disk_size_gib = 64 wireguard_port = 51820 diff --git a/applications/osmo/deploy/001-iac/variables.tf b/applications/osmo/deploy/001-iac/variables.tf index 576623a3b..7f526b9bf 100755 --- a/applications/osmo/deploy/001-iac/variables.tf +++ b/applications/osmo/deploy/001-iac/variables.tf @@ -18,8 +18,8 @@ variable "region" { default = "eu-north1" validation { - condition = contains(["eu-north1", "eu-west1", "eu-north2", "us-central1"], var.region) - error_message = "Region must be one of: 
eu-north1, eu-west1, eu-north2, us-central1" + condition = contains(["eu-north1", "eu-north2", "eu-west1", "me-west1", "uk-south1", "us-central1"], var.region) + error_message = "Region must be one of: eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1" } } @@ -262,9 +262,10 @@ variable "storage_bucket_name" { # ============================================================================= # PostgreSQL Configuration # Region-specific options: -# eu-west1: platform=cpu-d3, disk=nbs-csi-sc # eu-north1: platform=cpu-e2, disk=network-ssd -# Presets (both regions): 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, 16vcpu-64gb +# All other regions (eu-north2, eu-west1, me-west1, uk-south1, us-central1): +# platform=cpu-d3, disk=nbs-csi-sc +# Presets: 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, 16vcpu-64gb # ============================================================================= variable "enable_managed_postgresql" { @@ -291,9 +292,9 @@ variable "postgresql_public_access" { } variable "postgresql_platform" { - description = "PostgreSQL platform (cpu-e2 for eu-north1, cpu-d3 for eu-west1)" + description = "PostgreSQL platform (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" type = string - default = "cpu-e2" + default = "cpu-d3" } variable "postgresql_preset" { @@ -303,9 +304,9 @@ variable "postgresql_preset" { } variable "postgresql_disk_type" { - description = "PostgreSQL disk type (network-ssd for eu-north1, nbs-csi-sc for eu-west1)" + description = "PostgreSQL disk type (nbs-csi-sc for cpu-d3, network-ssd for cpu-e2/eu-north1)" type = string - default = "network-ssd" + default = "nbs-csi-sc" } variable "postgresql_disk_size_gib" { @@ -401,9 +402,9 @@ variable "enable_wireguard" { } variable "wireguard_platform" { - description = "Platform for WireGuard instance" + description = "Platform for WireGuard instance (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" type = string - default = "cpu-e2" + default = "cpu-d3" } variable "wireguard_preset" { From 
904d9c09f6d4684468662d281f930b8eae617569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 10 Feb 2026 15:11:47 +0100 Subject: [PATCH 07/37] add multi region support --- applications/osmo/deploy/001-iac/locals.tf | 51 ++++++++++++------- applications/osmo/deploy/001-iac/main.tf | 4 +- .../001-iac/modules/platform/variables.tf | 16 +++--- ...aform.tfvars.cost-optimized-secure.example | 4 +- .../terraform.tfvars.cost-optimized.example | 4 +- .../deploy/001-iac/terraform.tfvars.example | 6 +-- .../terraform.tfvars.production.example | 6 +-- .../001-iac/terraform.tfvars.secure.example | 4 +- applications/osmo/deploy/001-iac/variables.tf | 15 +++--- 9 files changed, 62 insertions(+), 48 deletions(-) diff --git a/applications/osmo/deploy/001-iac/locals.tf b/applications/osmo/deploy/001-iac/locals.tf index 8ea891a24..f20d6c140 100755 --- a/applications/osmo/deploy/001-iac/locals.tf +++ b/applications/osmo/deploy/001-iac/locals.tf @@ -14,34 +14,46 @@ locals { # Region-specific defaults region_defaults = { eu-north1 = { - gpu_nodes_platform = "gpu-h100-sxm" - gpu_nodes_preset = "8gpu-128vcpu-1600gb" - infiniband_fabric = "fabric-3" + gpu_nodes_platform = "gpu-h100-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-3" + postgresql_platform = "cpu-e2" + postgresql_disk_type = "network-ssd" } eu-north2 = { - gpu_nodes_platform = "gpu-h200-sxm" - gpu_nodes_preset = "8gpu-128vcpu-1600gb" - infiniband_fabric = "eu-north2-a" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "eu-north2-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" } eu-west1 = { - gpu_nodes_platform = "gpu-h200-sxm" - gpu_nodes_preset = "8gpu-128vcpu-1600gb" - infiniband_fabric = "fabric-5" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "fabric-5" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" } 
me-west1 = { - gpu_nodes_platform = "gpu-b200-sxm-a" - gpu_nodes_preset = "8gpu-160vcpu-1792gb" - infiniband_fabric = "me-west1-a" + gpu_nodes_platform = "gpu-b200-sxm-a" + gpu_nodes_preset = "8gpu-160vcpu-1792gb" + infiniband_fabric = "me-west1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" } uk-south1 = { - gpu_nodes_platform = "gpu-b300-sxm" - gpu_nodes_preset = "8gpu-192vcpu-2768gb" - infiniband_fabric = "uk-south1-a" + gpu_nodes_platform = "gpu-b300-sxm" + gpu_nodes_preset = "8gpu-192vcpu-2768gb" + infiniband_fabric = "uk-south1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" } us-central1 = { - gpu_nodes_platform = "gpu-h200-sxm" - gpu_nodes_preset = "8gpu-128vcpu-1600gb" - infiniband_fabric = "us-central1-a" + gpu_nodes_platform = "gpu-h200-sxm" + gpu_nodes_preset = "8gpu-128vcpu-1600gb" + infiniband_fabric = "us-central1-a" + postgresql_platform = "cpu-d3" + postgresql_disk_type = "network-ssd" } } @@ -76,6 +88,9 @@ locals { gpu_nodes_preset = coalesce(var.gpu_nodes_preset, local.current_region.gpu_nodes_preset) infiniband_fabric = coalesce(var.infiniband_fabric, local.current_region.infiniband_fabric) + postgresql_platform = coalesce(var.postgresql_platform, local.current_region.postgresql_platform) + postgresql_disk_type = coalesce(var.postgresql_disk_type, local.current_region.postgresql_disk_type) + # Generate unique storage bucket name if not provided storage_bucket_name = var.storage_bucket_name != "" ? 
var.storage_bucket_name : "${local.name_prefix}-storage-${random_string.suffix.result}" diff --git a/applications/osmo/deploy/001-iac/main.tf b/applications/osmo/deploy/001-iac/main.tf index af91f9e0c..137aa31a2 100755 --- a/applications/osmo/deploy/001-iac/main.tf +++ b/applications/osmo/deploy/001-iac/main.tf @@ -29,9 +29,9 @@ module "platform" { enable_managed_postgresql = var.enable_managed_postgresql postgresql_version = var.postgresql_version postgresql_public_access = var.postgresql_public_access - postgresql_platform = var.postgresql_platform + postgresql_platform = local.postgresql_platform postgresql_preset = var.postgresql_preset - postgresql_disk_type = var.postgresql_disk_type + postgresql_disk_type = local.postgresql_disk_type postgresql_disk_size_gib = var.postgresql_disk_size_gib postgresql_host_count = var.postgresql_host_count postgresql_database_name = var.postgresql_database_name diff --git a/applications/osmo/deploy/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/001-iac/modules/platform/variables.tf index de02a2bf8..444aea81d 100755 --- a/applications/osmo/deploy/001-iac/modules/platform/variables.tf +++ b/applications/osmo/deploy/001-iac/modules/platform/variables.tf @@ -97,13 +97,13 @@ variable "postgresql_public_access" { } variable "postgresql_platform" { - description = "PostgreSQL platform (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" + description = "PostgreSQL platform (cpu-e2 for managed PostgreSQL in all regions)" type = string - default = "cpu-d3" + default = "cpu-e2" validation { condition = contains(["cpu-d3", "cpu-e2"], var.postgresql_platform) - error_message = "PostgreSQL platform must be cpu-d3 (all regions) or cpu-e2 (eu-north1 only)." + error_message = "PostgreSQL platform must be cpu-e2 (recommended) or cpu-d3." 
} } @@ -113,19 +113,19 @@ variable "postgresql_preset" { default = "2vcpu-8gb" validation { - condition = contains(["2vcpu-8gb", "4vcpu-16gb", "8vcpu-32gb", "16vcpu-64gb"], var.postgresql_preset) - error_message = "PostgreSQL preset must be 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, or 16vcpu-64gb." + condition = contains(["2vcpu-8gb", "4vcpu-16gb", "8vcpu-32gb"], var.postgresql_preset) + error_message = "PostgreSQL preset must be 2vcpu-8gb, 4vcpu-16gb, or 8vcpu-32gb." } } variable "postgresql_disk_type" { - description = "PostgreSQL disk type (nbs-csi-sc for cpu-d3, network-ssd for cpu-e2/eu-north1)" + description = "PostgreSQL disk type (network-ssd for managed PostgreSQL in all regions)" type = string - default = "nbs-csi-sc" + default = "network-ssd" validation { condition = contains(["nbs-csi-sc", "network-ssd"], var.postgresql_disk_type) - error_message = "PostgreSQL disk type must be nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1)." + error_message = "PostgreSQL disk type must be network-ssd (recommended) or nbs-csi-sc." 
} } diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example index ec55a62ea..408c070c2 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example @@ -83,9 +83,9 @@ filestore_size_gib = 256 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service - minimal) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) -postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) +# postgresql_disk_type = null # Auto: network-ssd (all regions; see locals.tf) postgresql_disk_size_gib = 20 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example index 490e88b36..55666e49e 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example @@ -76,9 +76,9 @@ filestore_size_gib = 256 # Smaller filestore # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service - minimal) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) postgresql_preset = "2vcpu-8gb" # Minimum preset (cheapest) -postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc 
(cpu-d3) or network-ssd (cpu-e2/eu-north1) +# postgresql_disk_type = null # Auto: network-ssd (all regions; see locals.tf) postgresql_disk_size_gib = 20 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.example b/applications/osmo/deploy/001-iac/terraform.tfvars.example index 848e1a68d..88a28b19b 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.example @@ -58,9 +58,9 @@ filestore_size_gib = 1024 # PostgreSQL (Nebius Managed Service) # ----------------------------------------------------------------------------- # Platform depends on region: cpu-e2 (eu-north1), cpu-d3 (all other regions) -postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) -postgresql_preset = "2vcpu-8gb" # 2/4/8/16 vcpu options available -postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "2vcpu-8gb" # Available presets vary by region +# postgresql_disk_type = null # Auto: network-ssd (all regions; see locals.tf) postgresql_disk_size_gib = 50 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example index 9bda9b0fd..7ea2a726d 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example @@ -67,9 +67,9 @@ filestore_block_size_kib = 4 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service - HA) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) +postgresql_preset = "16vcpu-64gb" # Production size 
-postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) +postgresql_preset = "4vcpu-16gb" # Production size (available in all regions) +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) postgresql_disk_size_gib = 100 postgresql_host_count = 3 # HA with replicas diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example index c6826199d..084b217cd 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example @@ -62,9 +62,9 @@ filestore_size_gib = 1024 # ----------------------------------------------------------------------------- # PostgreSQL (Nebius Managed Service) # ----------------------------------------------------------------------------- -postgresql_platform = "cpu-d3" # cpu-d3 (all regions) or cpu-e2 (eu-north1 only) +# postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) postgresql_preset = "2vcpu-8gb" # Minimum preset -postgresql_disk_type = "nbs-csi-sc" # nbs-csi-sc (cpu-d3) or network-ssd (cpu-e2/eu-north1) +# postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) postgresql_disk_size_gib = 50 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/001-iac/variables.tf b/applications/osmo/deploy/001-iac/variables.tf index 7f526b9bf..a184adb3c 100755 --- a/applications/osmo/deploy/001-iac/variables.tf +++ b/applications/osmo/deploy/001-iac/variables.tf @@ -261,11 +261,10 @@ variable "storage_bucket_name" { # ============================================================================= # PostgreSQL Configuration -# Region-specific options: +# Region-specific defaults (auto-selected when set to null): # eu-north1: platform=cpu-e2, disk=network-ssd -# All other regions 
(eu-north2, eu-west1, me-west1, uk-south1, us-central1): -# platform=cpu-d3, disk=nbs-csi-sc -# Presets: 2vcpu-8gb, 4vcpu-16gb, 8vcpu-32gb, 16vcpu-64gb +# All other regions: platform=cpu-d3, disk=network-ssd +# Safe preset across all regions: 2vcpu-8gb or 4vcpu-16gb # ============================================================================= variable "enable_managed_postgresql" { @@ -292,9 +291,9 @@ variable "postgresql_public_access" { } variable "postgresql_platform" { - description = "PostgreSQL platform (cpu-d3 available in all regions, cpu-e2 only in eu-north1)" + description = "PostgreSQL platform (null for region default: cpu-e2 in eu-north1, cpu-d3 elsewhere)" type = string - default = "cpu-d3" + default = null } variable "postgresql_preset" { @@ -304,9 +303,9 @@ variable "postgresql_preset" { } variable "postgresql_disk_type" { - description = "PostgreSQL disk type (nbs-csi-sc for cpu-d3, network-ssd for cpu-e2/eu-north1)" + description = "PostgreSQL disk type (null for region default: network-ssd in eu-north1, nbs-csi-sc elsewhere)" type = string - default = "nbs-csi-sc" + default = null } variable "postgresql_disk_size_gib" { From 3f91352a821f11d16c643164dd389fb3eaf910b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 10 Feb 2026 20:11:34 +0100 Subject: [PATCH 08/37] add load balancer to ui --- .../osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh index 117108cd9..c642c3014 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh @@ -1126,7 +1126,7 @@ if [[ "${DEPLOY_UI:-true}" == "true" ]]; then helm upgrade --install osmo-ui osmo/web-ui \ --namespace "${OSMO_NAMESPACE}" \ - --set service.type=ClusterIP \ + --set 
services.ui.service.type=LoadBalancer \ --set global.domain=osmo.local \ --set services.ui.ingress.enabled=false \ --set services.ui.replicas=1 \ From 752b541addebad86aa19b8b53fcd1f47ba9c3a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Feb 2026 11:16:15 +0100 Subject: [PATCH 09/37] - gpu operator optional - add network operator for infiniband --- applications/osmo/deploy/001-iac/locals.tf | 8 +++++ applications/osmo/deploy/001-iac/main.tf | 2 ++ .../osmo/deploy/001-iac/modules/k8s/main.tf | 3 ++ .../deploy/001-iac/modules/k8s/variables.tf | 12 +++++++ ...aform.tfvars.cost-optimized-secure.example | 1 + .../terraform.tfvars.cost-optimized.example | 1 + .../deploy/001-iac/terraform.tfvars.example | 1 + .../terraform.tfvars.production.example | 1 + .../001-iac/terraform.tfvars.secure.example | 1 + applications/osmo/deploy/001-iac/variables.tf | 6 ++++ .../002-setup/01-deploy-gpu-infrastructure.sh | 33 ++++++++++++++----- .../osmo/deploy/002-setup/defaults.sh | 6 +++- .../002-setup/values/network-operator.yaml | 7 +--- 13 files changed, 67 insertions(+), 15 deletions(-) diff --git a/applications/osmo/deploy/001-iac/locals.tf b/applications/osmo/deploy/001-iac/locals.tf index f20d6c140..f1b9e0dd6 100755 --- a/applications/osmo/deploy/001-iac/locals.tf +++ b/applications/osmo/deploy/001-iac/locals.tf @@ -91,6 +91,14 @@ locals { postgresql_platform = coalesce(var.postgresql_platform, local.current_region.postgresql_platform) postgresql_disk_type = coalesce(var.postgresql_disk_type, local.current_region.postgresql_disk_type) + # Driverfull image: map GPU platform to CUDA driver preset + platform_to_cuda = { + gpu-b200-sxm-a = "cuda12.8" + gpu-b200-sxm = "cuda12.8" + gpu-b300-sxm = "cuda13.0" + } + gpu_drivers_preset = lookup(local.platform_to_cuda, local.gpu_nodes_platform, "cuda12") + # Generate unique storage bucket name if not provided storage_bucket_name = var.storage_bucket_name != "" ? 
var.storage_bucket_name : "${local.name_prefix}-storage-${random_string.suffix.result}" diff --git a/applications/osmo/deploy/001-iac/main.tf b/applications/osmo/deploy/001-iac/main.tf index 137aa31a2..ae471c25d 100755 --- a/applications/osmo/deploy/001-iac/main.tf +++ b/applications/osmo/deploy/001-iac/main.tf @@ -89,6 +89,8 @@ module "k8s" { infiniband_fabric = local.infiniband_fabric enable_gpu_taints = var.enable_gpu_taints gpu_nodes_preemptible = var.gpu_nodes_preemptible + gpu_nodes_driverfull_image = var.gpu_nodes_driverfull_image + gpu_drivers_preset = local.gpu_drivers_preset # Filestore enable_filestore = var.enable_filestore diff --git a/applications/osmo/deploy/001-iac/modules/k8s/main.tf b/applications/osmo/deploy/001-iac/modules/k8s/main.tf index b5d6bc07b..e913c0d87 100755 --- a/applications/osmo/deploy/001-iac/modules/k8s/main.tf +++ b/applications/osmo/deploy/001-iac/modules/k8s/main.tf @@ -142,6 +142,9 @@ resource "nebius_mk8s_v1_node_group" "gpu" { # GPU cluster for InfiniBand gpu_cluster = var.enable_gpu_cluster ? nebius_compute_v1_gpu_cluster.main[0] : null + # Driverfull images (pre-installed NVIDIA drivers, no GPU Operator driver needed) + gpu_settings = var.gpu_nodes_driverfull_image ? { drivers_preset = var.gpu_drivers_preset } : null + # Preemptible configuration preemptible = var.gpu_nodes_preemptible ? { on_preemption = "STOP" diff --git a/applications/osmo/deploy/001-iac/modules/k8s/variables.tf b/applications/osmo/deploy/001-iac/modules/k8s/variables.tf index e01b7eaac..b2662db58 100755 --- a/applications/osmo/deploy/001-iac/modules/k8s/variables.tf +++ b/applications/osmo/deploy/001-iac/modules/k8s/variables.tf @@ -175,6 +175,18 @@ variable "gpu_nodes_preemptible" { default = false } +variable "gpu_nodes_driverfull_image" { + description = "Use Nebius driverfull images with pre-installed NVIDIA drivers" + type = bool + default = false +} + +variable "gpu_drivers_preset" { + description = "CUDA driver preset for driverfull images (e.g. 
cuda12, cuda12.8, cuda13.0)" + type = string + default = "cuda12" +} + # ----------------------------------------------------------------------------- # Filestore Configuration # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example index 408c070c2..8d4c88f1f 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example @@ -65,6 +65,7 @@ gpu_nodes_assign_public_ip = false # Private only enable_gpu_cluster = false # No InfiniBand for L40S enable_gpu_taints = true gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) # GPU options by region (see locals.tf for full list): # eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example index 55666e49e..ca8d54e5c 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example @@ -58,6 +58,7 @@ gpu_nodes_assign_public_ip = false enable_gpu_cluster = false # No InfiniBand for L40S enable_gpu_taints = true gpu_nodes_preemptible = false # Set true if your project allows preemptible GPUs +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) # GPU options by region (see locals.tf for full list): # eu-north1: gpu-l40s-a, gpu-l40s-d, gpu-h100-sxm, gpu-h200-sxm diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.example 
b/applications/osmo/deploy/001-iac/terraform.tfvars.example index 88a28b19b..89e7b63ea 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.example @@ -47,6 +47,7 @@ gpu_nodes_assign_public_ip = false # Private nodes only enable_gpu_cluster = true # InfiniBand enable_gpu_taints = true gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for B200/B300) # ----------------------------------------------------------------------------- # Storage diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example index 95745db85..778da4f99 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.production.example @@ -55,6 +55,7 @@ enable_gpu_cluster = true # InfiniBand enabled infiniband_fabric = null # Use region default enable_gpu_taints = true gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = true # Nebius pre-installed drivers (recommended for B200/B300) # ----------------------------------------------------------------------------- # Storage (production grade) diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example index 084b217cd..f74be9ea1 100755 --- a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example +++ b/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example @@ -52,6 +52,7 @@ gpu_nodes_assign_public_ip = false # Private only enable_gpu_cluster = true enable_gpu_taints = true gpu_nodes_preemptible = false # Preemptible requires project permissions +gpu_nodes_driverfull_image = false # Set true for Nebius pre-installed drivers (recommended for 
B200/B300) # ----------------------------------------------------------------------------- # Storage diff --git a/applications/osmo/deploy/001-iac/variables.tf b/applications/osmo/deploy/001-iac/variables.tf index a184adb3c..39db0992a 100755 --- a/applications/osmo/deploy/001-iac/variables.tf +++ b/applications/osmo/deploy/001-iac/variables.tf @@ -221,6 +221,12 @@ variable "gpu_nodes_preemptible" { default = false } +variable "gpu_nodes_driverfull_image" { + description = "Use Nebius driverfull images (pre-installed NVIDIA drivers). When true, GPU Operator driver installation is not needed." + type = bool + default = false +} + # ============================================================================= # Filestore Configuration # ============================================================================= diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh index 58e6f1241..4e110cae4 100755 --- a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh @@ -25,18 +25,35 @@ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update helm repo update # ----------------------------------------------------------------------------- -# Deploy GPU Operator +# Deploy GPU Operator (skipped when using driverfull images) # ----------------------------------------------------------------------------- -log_info "Deploying NVIDIA GPU Operator..." +if [[ "${USE_DRIVERFULL_IMAGES:-false}" == "true" ]]; then + log_info "Skipping GPU Operator (using Nebius driverfull images with pre-installed drivers)" + log_info "Installing NVIDIA device plugin for driverfull mode..." + + kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + # With driverfull images, we still need the GPU Operator for toolkit, device-plugin, + # dcgm, etc. 
- but driver installation is disabled. + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --namespace "${GPU_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/gpu-operator.yaml" \ + --set driver.enabled=false \ + --timeout 10m -kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + log_success "GPU Operator deployed (driver disabled - using driverfull images)" +else + log_info "Deploying NVIDIA GPU Operator (with driver installation)..." -helm upgrade --install gpu-operator nvidia/gpu-operator \ - --namespace "${GPU_OPERATOR_NAMESPACE}" \ - --values "${VALUES_DIR}/gpu-operator.yaml" \ - --timeout 10m + kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - -log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --namespace "${GPU_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/gpu-operator.yaml" \ + --timeout 10m + + log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" +fi # Brief wait for core operator pod only (not GPU node components) sleep 10 diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 14aae0071..548eafaca 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -23,7 +23,11 @@ export TOOLKIT_ENABLED="true" export DEVICE_PLUGIN_ENABLED="true" export MIG_MANAGER_ENABLED="false" -# Network Operator (only needed for InfiniBand/GPU clusters) +# Driverfull images (Nebius pre-installed NVIDIA drivers, skips GPU Operator driver) +# Recommended for B200/B300 GPUs where the GPU Operator's bundled driver may not support NVSwitch. 
+export USE_DRIVERFULL_IMAGES="false" # Set to "true" to use driverfull images + +# Network Operator (only needed for InfiniBand/GPU clusters without driverfull images) export ENABLE_NETWORK_OPERATOR="false" # Set to "true" if using InfiniBand # Observability settings diff --git a/applications/osmo/deploy/002-setup/values/network-operator.yaml b/applications/osmo/deploy/002-setup/values/network-operator.yaml index 146a9daca..eebf2e472 100755 --- a/applications/osmo/deploy/002-setup/values/network-operator.yaml +++ b/applications/osmo/deploy/002-setup/values/network-operator.yaml @@ -3,13 +3,8 @@ # Operator settings operator: - nodeSelector: - node-role.kubernetes.io/control-plane: "" tolerations: - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane + - key: nvidia.com/gpu operator: Exists effect: NoSchedule From 527fd08cd70606c40f1bd98058aa7276f903b383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Feb 2026 12:41:53 +0100 Subject: [PATCH 10/37] - autodetect of gpu operator needed --- applications/osmo/deploy/001-iac/outputs.tf | 8 ++++++++ .../deploy/002-setup/01-deploy-gpu-infrastructure.sh | 9 +++++++++ applications/osmo/deploy/002-setup/defaults.sh | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/001-iac/outputs.tf index 101b52b25..0c9c8cc4f 100755 --- a/applications/osmo/deploy/001-iac/outputs.tf +++ b/applications/osmo/deploy/001-iac/outputs.tf @@ -132,6 +132,14 @@ output "wireguard" { } : null } +# ----------------------------------------------------------------------------- +# GPU Configuration Outputs +# ----------------------------------------------------------------------------- +output "gpu_nodes_driverfull_image" { + description = "Whether GPU nodes use driverfull images with pre-installed drivers" + value = var.gpu_nodes_driverfull_image +} + # 
----------------------------------------------------------------------------- # Connection Instructions # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh index 4e110cae4..ac6289b7d 100755 --- a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh @@ -24,6 +24,15 @@ log_info "Adding Helm repositories..." helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update helm repo update +# Auto-detect driverfull images from Terraform config +if [[ -z "${USE_DRIVERFULL_IMAGES:-}" ]]; then + TF_DRIVERFULL=$(get_tf_output "gpu_nodes_driverfull_image" "../001-iac" || echo "") + if [[ "$TF_DRIVERFULL" == "true" ]]; then + USE_DRIVERFULL_IMAGES="true" + log_info "Auto-detected driverfull images from Terraform" + fi +fi + # ----------------------------------------------------------------------------- # Deploy GPU Operator (skipped when using driverfull images) # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 548eafaca..746f03206 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -25,7 +25,7 @@ export MIG_MANAGER_ENABLED="false" # Driverfull images (Nebius pre-installed NVIDIA drivers, skips GPU Operator driver) # Recommended for B200/B300 GPUs where the GPU Operator's bundled driver may not support NVSwitch. 
-export USE_DRIVERFULL_IMAGES="false" # Set to "true" to use driverfull images +export USE_DRIVERFULL_IMAGES="${USE_DRIVERFULL_IMAGES:-}" # Auto-detected from Terraform; set "true"/"false" to override # Network Operator (only needed for InfiniBand/GPU clusters without driverfull images) export ENABLE_NETWORK_OPERATOR="false" # Set to "true" if using InfiniBand From 0c769291bb630821f360f878b85ce106aec7c91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Feb 2026 13:11:11 +0100 Subject: [PATCH 11/37] - autodetect of gpu operator needed --- applications/osmo/deploy/002-setup/lib/common.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index 68800b7f0..50fdc893b 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -27,6 +27,9 @@ log_error() { echo -e "${RED}[✗]${NC} $1" } +# Pause on error so the user can read the output before the terminal closes +trap '_exit_code=$?; if [[ $_exit_code -ne 0 ]]; then echo ""; log_error "Script failed (exit code $_exit_code). Press Enter to close..."; read -r; fi' EXIT + # Read input with a prompt into a variable (bash/zsh compatible). 
read_prompt_var() { local prompt=$1 From ea56119828c76001f2fc03c9ffba5bf125a3f458 Mon Sep 17 00:00:00 2001 From: Jathavan Sriram Date: Wed, 11 Feb 2026 13:24:40 +0100 Subject: [PATCH 12/37] Removal OSMO Proxy - usage of Nebius LB and NGNIX Ingress --- applications/osmo/README.md | 104 +++----- applications/osmo/deploy/001-iac/outputs.tf | 7 +- .../002-setup/02-deploy-observability.sh | 2 +- .../002-setup/03-deploy-nginx-ingress.sh | 82 ++++++ ...ane.sh => 04-deploy-osmo-control-plane.sh} | 248 +++++++++++++----- ...o-backend.sh => 05-deploy-osmo-backend.sh} | 30 ++- ...ure-storage.sh => 06-configure-storage.sh} | 6 +- ...ice-url.sh => 07-configure-service-url.sh} | 38 ++- ...atform.sh => 08-configure-gpu-platform.sh} | 0 applications/osmo/deploy/002-setup/README.md | 41 +-- .../cleanup/uninstall-nginx-ingress.sh | 11 + .../osmo/deploy/002-setup/defaults.sh | 8 + .../osmo/deploy/002-setup/lib/common.sh | 51 ++++ .../osmo/deploy/002-setup/nginx-proxy.yaml | 120 --------- applications/osmo/deploy/README.md | 16 +- 15 files changed, 463 insertions(+), 301 deletions(-) create mode 100755 applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh rename applications/osmo/deploy/002-setup/{03-deploy-osmo-control-plane.sh => 04-deploy-osmo-control-plane.sh} (87%) rename applications/osmo/deploy/002-setup/{04-deploy-osmo-backend.sh => 05-deploy-osmo-backend.sh} (92%) rename applications/osmo/deploy/002-setup/{05-configure-storage.sh => 06-configure-storage.sh} (96%) rename applications/osmo/deploy/002-setup/{06-configure-service-url.sh => 07-configure-service-url.sh} (69%) rename applications/osmo/deploy/002-setup/{07-configure-gpu-platform.sh => 08-configure-gpu-platform.sh} (100%) create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh delete mode 100755 applications/osmo/deploy/002-setup/nginx-proxy.yaml diff --git a/applications/osmo/README.md b/applications/osmo/README.md index 645de1eb2..01f1b11fb 100755 --- 
a/applications/osmo/README.md +++ b/applications/osmo/README.md @@ -207,22 +207,29 @@ See [Terraform README](deploy/001-iac/README.md) for configuration options, and - KAI Scheduler for GPU workload scheduling - Prometheus, Grafana, and Loki for monitoring -4. Deploy OSMO control plane: +4. Deploy NGINX Ingress Controller: ```bash - ./03-deploy-osmo-control-plane.sh + ./03-deploy-nginx-ingress.sh + ``` + + This deploys the community NGINX Ingress Controller with a LoadBalancer IP. It provides path-based routing to all OSMO services (API, router, Web UI). The LoadBalancer IP is auto-detected by later scripts. + +5. Deploy OSMO control plane: + ```bash + ./04-deploy-osmo-control-plane.sh ``` This deploys the core OSMO services: - Creates `osmo` namespace and PostgreSQL/MEK secrets - Initializes databases on Nebius Managed PostgreSQL - Deploys Redis and OSMO services (API, agent, worker, logger) - - Sets up nginx proxy for routing + - Creates Kubernetes Ingress resources for path-based routing via the NGINX Ingress Controller > **Note:** The script automatically retrieves PostgreSQL password and MEK from MysteryBox if you ran `secrets-init.sh` earlier. -5. Deploy OSMO backend operator: +7. Deploy OSMO backend operator: ```bash - ./04-deploy-osmo-backend.sh + ./05-deploy-osmo-backend.sh ``` The script automatically: @@ -239,22 +246,23 @@ See [Terraform README](deploy/001-iac/README.md) for configuration options, and > **Manual alternative:** If you prefer to create the token manually, set `OSMO_SERVICE_TOKEN` environment variable before running the script. -6. Verify backend deployment: +8. 
Verify backend deployment: - To verify the backend is registered with OSMO, start a port-forward and check: + Verify the backend is registered with OSMO using the NGINX Ingress LoadBalancer IP: ```bash - # Terminal 1: Start port-forward (keep running) - kubectl port-forward -n osmo svc/osmo-service 8080:80 + # Check backend registration + curl http:///api/configs/backend - # Terminal 2: Verify backend registration + # Or via OSMO CLI osmo config show BACKEND default ``` + The Ingress LoadBalancer IP is shown in the output of `04-deploy-osmo-control-plane.sh`. You should see the backend configuration with status `ONLINE`. -7. Configure OSMO storage: +9. Configure OSMO storage: ```bash - ./05-configure-storage.sh + ./06-configure-storage.sh ``` The script automatically: @@ -263,47 +271,40 @@ See [Terraform README](deploy/001-iac/README.md) for configuration options, and - Configures OSMO to use Nebius Object Storage for workflow artifacts - Verifies the configuration - > **Note:** The `osmo-storage` secret (with S3 credentials) was created automatically by `03-deploy-osmo-control-plane.sh`. + > **Note:** The `osmo-storage` secret (with S3 credentials) was created automatically by `04-deploy-osmo-control-plane.sh`. -8. Access OSMO (port-forwarding): - - Since the cluster uses private networking, use port-forwarding to access OSMO services: +10. Access OSMO (via NGINX Ingress LoadBalancer): + The NGINX Ingress Controller exposes OSMO via a LoadBalancer IP. 
The IP is shown in the output of `04-deploy-osmo-control-plane.sh`, or retrieve it with: ```bash - # Terminal 1: Forward OSMO API (required for CLI commands) - kubectl port-forward -n osmo svc/osmo-service 8080:80 - - # Terminal 2: Forward OSMO Web UI - kubectl port-forward -n osmo svc/osmo-ui 8081:80 + kubectl get svc -n ingress-nginx ingress-nginx-controller -o jsonpath='{.status.loadBalancer.ingress[0].ip}' ``` - Access points: - - **OSMO API**: http://localhost:8080 (for CLI and API calls) - - **OSMO Web UI**: http://localhost:8081 (browser-based dashboard) + Access points (replace `` with your LoadBalancer IP): + - **OSMO API**: `http:///api/version` + - **OSMO Web UI**: `http://` - Login to OSMO CLI (required before running commands): + Login to OSMO CLI: ```bash - osmo login http://localhost:8080 --method dev --username admin - ``` - -9. Configure service URL (required for workflows): - ```bash - ./06-configure-service-url.sh + osmo login http:// --method dev --username admin ``` - The script configures `service_base_url` which is required for: - - The `osmo-ctrl` sidecar to stream workflow logs - - Task status reporting and completion tracking - - Authentication token refresh during workflow execution - - > **Important:** Without this configuration, workflows will get stuck with `FETCH_FAILURE` errors. + > **Fallback:** If the LoadBalancer IP is not reachable, you can use port-forwarding: + > ```bash + > kubectl port-forward -n osmo svc/osmo-service 8080:80 + > osmo login http://localhost:8080 --method dev --username admin + > ``` + + > **Note:** The `service_base_url` (required for workflow execution) is automatically configured + > by `04-deploy-osmo-control-plane.sh` using the NGINX Ingress LoadBalancer IP. If you need to + > reconfigure it manually, run `./07-configure-service-url.sh`. -10. Configure pool for GPU workloads: +11. Configure pool for GPU workloads: The default pool needs GPU platform configuration to run GPU workflows. 
This creates a pod template with the correct node selector and tolerations for GPU nodes: ```bash - ./07-configure-gpu-platform.sh + ./08-configure-gpu-platform.sh ``` The script: @@ -317,27 +318,6 @@ See [Terraform README](deploy/001-iac/README.md) for configuration options, and osmo config show POD_TEMPLATE gpu_tolerations ``` -11. Set up port-forwarding for OSMO access: - - Before using the OSMO CLI or Web UI, set up port-forwarding to the OSMO services: - - ```bash - # Terminal 1: Port-forward to OSMO API (required for CLI and API access) - kubectl port-forward -n osmo svc/osmo-service 8080:80 - - # Terminal 2: Port-forward to OSMO Web UI (optional, for browser access) - kubectl port-forward -n osmo svc/osmo-ui 8081:80 - ``` - - Then configure the OSMO CLI to use the forwarded port: - ```bash - osmo profile set endpoint http://localhost:8080 - ``` - - Access points: - - **OSMO API**: http://localhost:8080 - - **OSMO Web UI**: http://localhost:8081 - 12. Run a test workflow (optional): Verify the complete setup by running a test workflow from the `workflows/osmo/` directory: @@ -356,12 +336,10 @@ See [Terraform README](deploy/001-iac/README.md) for configuration options, and osmo workflow list osmo workflow query - # View workflow logs (CLI - recommended when using port-forwarding) + # View workflow logs osmo workflow query --logs ``` - > **Note:** When using port-forwarding, the Web UI cannot display workflow logs (it tries to resolve internal Kubernetes DNS). Use the CLI commands above or `kubectl logs` instead. 
- Available test workflows in `workflows/osmo/`: - `hello_nebius.yaml` - Simple GPU hello world - `gpu_test.yaml` - GPU validation test diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/001-iac/outputs.tf index 101b52b25..e2447db08 100755 --- a/applications/osmo/deploy/001-iac/outputs.tf +++ b/applications/osmo/deploy/001-iac/outputs.tf @@ -153,10 +153,11 @@ output "next_steps" { cd ../002-setup ./01-deploy-gpu-infrastructure.sh ./02-deploy-observability.sh - ./03-deploy-osmo-control-plane.sh - ./04-deploy-osmo-backend.sh + ./03-deploy-nginx-ingress.sh + ./04-deploy-osmo-control-plane.sh + ./05-deploy-osmo-backend.sh - ${var.enable_managed_postgresql ? "PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 03-deploy-osmo-control-plane.sh)"} + ${var.enable_managed_postgresql ? 
"PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 04-deploy-osmo-control-plane.sh)"} Object Storage: Bucket: ${module.platform.storage_bucket_name} diff --git a/applications/osmo/deploy/002-setup/02-deploy-observability.sh b/applications/osmo/deploy/002-setup/02-deploy-observability.sh index ef3c22f13..cee09bac5 100755 --- a/applications/osmo/deploy/002-setup/02-deploy-observability.sh +++ b/applications/osmo/deploy/002-setup/02-deploy-observability.sh @@ -99,5 +99,5 @@ echo "Access Prometheus:" echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-kube-prometheus-prometheus 9090:9090" echo " URL: http://localhost:9090" echo "" -echo "Next step: ./03-deploy-osmo-control-plane.sh" +echo "Next step: ./03-deploy-nginx-ingress.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh new file mode 100755 index 000000000..4da7c82a6 --- /dev/null +++ b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# +# Deploy NGINX Ingress Controller (community) +# Provides path-based routing for all OSMO services (API, router, Web UI). +# +# This installs the same controller OSMO uses elsewhere: +# - OSMO quick-start chart (Chart.yaml) depends on ingress-nginx from the same Helm repo. +# - OSMO Kind runner (run/start_service_kind.py) installs ingress-nginx the same way. +# We do not use the quick-start umbrella chart here (Nebius uses managed DB, etc.), +# so we install the controller explicitly. Not a duplicate of OSMO—same upstream chart. +# +# Run before 04-deploy-osmo-control-plane.sh. 
+# See: https://kubernetes.github.io/ingress-nginx/deploy/ + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" + +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" + +echo "" +echo "========================================" +echo " NGINX Ingress Controller Deployment" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Add Helm repo +# ----------------------------------------------------------------------------- +log_info "Adding ingress-nginx Helm repository..." +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create namespace and install +# ----------------------------------------------------------------------------- +log_info "Creating namespace ${INGRESS_NAMESPACE}..." +kubectl create namespace "${INGRESS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +log_info "Installing NGINX Ingress Controller..." +helm upgrade --install "${INGRESS_RELEASE_NAME}" ingress-nginx/ingress-nginx \ + --namespace "${INGRESS_NAMESPACE}" \ + --set controller.service.type=LoadBalancer \ + --wait --timeout 5m || { + log_warning "Helm install returned non-zero; controller may still be starting." +} + +log_success "NGINX Ingress Controller deployed" + +# ----------------------------------------------------------------------------- +# Wait for LoadBalancer IP (optional; may take 1–2 min on cloud) +# ----------------------------------------------------------------------------- +log_info "Waiting for LoadBalancer IP (up to 120s)..." 
+for i in $(seq 1 24); do + LB_IP=$(kubectl get svc -n "${INGRESS_NAMESPACE}" -l app.kubernetes.io/name=ingress-nginx -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + log_success "LoadBalancer IP: ${LB_IP}" + echo "" + echo "OSMO will be accessible at:" + echo " http://${LB_IP}" + echo "" + echo "This URL is auto-detected by 04-deploy-osmo-control-plane.sh." + echo "" + break + fi + sleep 5 +done +if [[ -z "${LB_IP:-}" ]]; then + log_warning "LoadBalancer IP not yet assigned. Check: kubectl get svc -n ${INGRESS_NAMESPACE}" +fi + +echo "========================================" +log_success "NGINX Ingress deployment complete" +echo "========================================" +echo "" +echo "Next: run 04-deploy-osmo-control-plane.sh" +echo "" diff --git a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh similarity index 87% rename from applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh rename to applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh index c642c3014..266db7016 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh @@ -849,6 +849,16 @@ fi # ----------------------------------------------------------------------------- log_info "Creating OSMO values file..." +# NGINX Ingress – run 03-deploy-nginx-ingress.sh before this script +# When OSMO_INGRESS_HOSTNAME is empty (default), ingress matches any Host header, +# allowing direct IP-based access. Set it to a real domain for host-based routing. 
+INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" +if [[ -n "$INGRESS_HOSTNAME" ]]; then + log_info "Ingress hostname: ${INGRESS_HOSTNAME}" +else + log_info "Ingress hostname: (any — IP-based access)" +fi + # Create the values file with proper extraEnv and extraVolumes for each service # This configures PostgreSQL password via env var and MEK via volume mount cat > /tmp/osmo_values.yaml </dev/null || true - log_success "OSMO Service Helm deployment complete" # ----------------------------------------------------------------------------- @@ -1097,54 +1116,56 @@ kubectl create secret generic db-secret \ --from-literal=db-password="${POSTGRES_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - +ROUTER_HELM_ARGS=( + --namespace "${OSMO_NAMESPACE}" + --set service.type=ClusterIP + --set services.configFile.enabled=true + --set "services.postgres.serviceName=${POSTGRES_HOST}" + --set "services.postgres.port=${POSTGRES_PORT}" + --set services.postgres.db=osmo + --set "services.postgres.user=${POSTGRES_USER}" + --set services.service.ingress.enabled=true + --set services.service.ingress.ingressClass=nginx + --set services.service.ingress.sslEnabled=false + --set services.service.scaling.minReplicas=1 + --set services.service.scaling.maxReplicas=1 + --set sidecars.envoy.enabled=false + --set sidecars.logAgent.enabled=false +) +[[ -n "$INGRESS_HOSTNAME" ]] && ROUTER_HELM_ARGS+=(--set "services.service.hostname=${INGRESS_HOSTNAME}" --set "global.domain=${INGRESS_HOSTNAME}") + helm upgrade --install osmo-router osmo/router \ - --namespace "${OSMO_NAMESPACE}" \ - --set service.type=ClusterIP \ - --set global.domain=osmo.local \ - --set services.configFile.enabled=true \ - --set services.postgres.serviceName="${POSTGRES_HOST}" \ - --set services.postgres.port=${POSTGRES_PORT} \ - --set services.postgres.db=osmo \ - --set services.postgres.user="${POSTGRES_USER}" \ - --set services.service.ingress.enabled=false \ - --set services.service.scaling.minReplicas=1 \ - --set 
services.service.scaling.maxReplicas=1 \ - --set sidecars.envoy.enabled=false \ - --set sidecars.logAgent.enabled=false \ + "${ROUTER_HELM_ARGS[@]}" \ --wait --timeout 5m || log_warning "Router deployment had issues" log_success "OSMO Router deployed" -# Delete router ingress -kubectl delete ingress -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/instance=osmo-router --ignore-not-found 2>/dev/null || true - # ----------------------------------------------------------------------------- # Step 8: Deploy Web UI (Optional) # ----------------------------------------------------------------------------- if [[ "${DEPLOY_UI:-true}" == "true" ]]; then log_info "Deploying OSMO Web UI..." + UI_HELM_ARGS=( + --namespace "${OSMO_NAMESPACE}" + --set services.ui.service.type=ClusterIP + --set services.ui.ingress.enabled=true + --set services.ui.ingress.ingressClass=nginx + --set services.ui.ingress.sslEnabled=false + --set services.ui.replicas=1 + --set "services.ui.apiHostname=osmo-service.${OSMO_NAMESPACE}.svc.cluster.local:80" + --set sidecars.envoy.enabled=false + --set sidecars.logAgent.enabled=false + ) + [[ -n "$INGRESS_HOSTNAME" ]] && UI_HELM_ARGS+=(--set "services.ui.hostname=${INGRESS_HOSTNAME}" --set "global.domain=${INGRESS_HOSTNAME}") + helm upgrade --install osmo-ui osmo/web-ui \ - --namespace "${OSMO_NAMESPACE}" \ - --set services.ui.service.type=LoadBalancer \ - --set global.domain=osmo.local \ - --set services.ui.ingress.enabled=false \ - --set services.ui.replicas=1 \ - --set services.ui.apiHostname="osmo-service.${OSMO_NAMESPACE}.svc.cluster.local:80" \ - --set sidecars.envoy.enabled=false \ - --set sidecars.logAgent.enabled=false \ + "${UI_HELM_ARGS[@]}" \ --wait --timeout 5m || log_warning "UI deployment had issues" log_success "OSMO Web UI deployed" - - # Delete UI ingress - kubectl delete ingress -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/instance=osmo-ui --ignore-not-found 2>/dev/null || true fi -# Cleanup all remaining ingress resources (final sweep) 
-log_info "Final cleanup of any remaining Ingress resources..." -kubectl delete ingress --all -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true - # Cleanup temp files rm -f /tmp/osmo_values.yaml @@ -1233,24 +1254,7 @@ done log_success "Service ports verified" # ----------------------------------------------------------------------------- -# Step 11: Deploy NGINX Proxy -# ----------------------------------------------------------------------------- -# The nginx proxy routes traffic to osmo-service, osmo-logger, and osmo-agent -# Required for osmo-ctrl sidecar to communicate with the OSMO service -log_info "Deploying OSMO proxy (nginx)..." - -if [[ -f "${SCRIPT_DIR}/nginx-proxy.yaml" ]]; then - kubectl apply -f "${SCRIPT_DIR}/nginx-proxy.yaml" - kubectl rollout status deployment/osmo-proxy -n "${OSMO_NAMESPACE}" --timeout=120s || \ - log_warning "Timeout waiting for osmo-proxy rollout" - log_success "OSMO proxy deployed" -else - log_warning "nginx-proxy.yaml not found - skipping proxy deployment" - log_warning "Workflows may fail without the proxy. Create nginx-proxy.yaml and apply manually." -fi - -# ----------------------------------------------------------------------------- -# Step 12: Verify Deployment +# Step 11: Verify Deployment # ----------------------------------------------------------------------------- echo "" log_info "Verifying deployment configuration..." 
@@ -1282,26 +1286,129 @@ echo "" echo "Services:" kubectl get svc -n "${OSMO_NAMESPACE}" -# Get service URL -OSMO_SVC=$(kubectl get svc -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=service -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "osmo-service") -OSMO_PORT=$(kubectl get svc "${OSMO_SVC}" -n "${OSMO_NAMESPACE}" -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "80") +# ----------------------------------------------------------------------------- +# Step 12: Configure service_base_url (required for workflow execution) +# ----------------------------------------------------------------------------- +# The osmo-ctrl sidecar in every workflow pod needs service_base_url to +# stream logs, report task status, and refresh tokens. +# This is an application-level config that must be set via the OSMO API. + +echo "" +log_info "Configuring service_base_url for workflow execution..." + +# Detect target URL from Ingress +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + TARGET_SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${TARGET_SERVICE_URL}" +elif [[ -n "$INGRESS_URL" ]]; then + TARGET_SERVICE_URL="${INGRESS_URL}" + log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" +else + log_warning "Could not detect Ingress URL. Skipping service_base_url configuration." + log_warning "Run ./07-configure-service-url.sh manually after verifying the Ingress." + TARGET_SERVICE_URL="" +fi + +if [[ -n "$TARGET_SERVICE_URL" ]]; then + # Start port-forward to access the OSMO API + log_info "Starting port-forward to configure service_base_url..." + kubectl port-forward -n "${OSMO_NAMESPACE}" svc/osmo-service 8080:80 &>/dev/null & + _PF_PID=$! 
+ + _cleanup_pf() { + if [[ -n "${_PF_PID:-}" ]]; then + kill $_PF_PID 2>/dev/null || true + wait $_PF_PID 2>/dev/null || true + fi + } + + # Wait for port-forward to be ready + _pf_ready=false + for i in $(seq 1 30); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 + done + + if [[ "$_pf_ready" == "true" ]]; then + # Login + if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + # Check current value + CURRENT_SVC_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + + if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url already configured: ${CURRENT_SVC_URL}" + else + if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then + log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" + fi + + # Write config + cat > /tmp/service_url_fix.json << SVCEOF +{ + "service_base_url": "${TARGET_SERVICE_URL}" +} +SVCEOF + if osmo config update SERVICE --file /tmp/service_url_fix.json --description "Set service_base_url for osmo-ctrl sidecar" 2>/dev/null; then + # Verify + NEW_SVC_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url configured: ${NEW_SVC_URL}" + else + log_warning "service_base_url verification failed. Run ./07-configure-service-url.sh manually." + fi + else + log_warning "Failed to set service_base_url. Run ./07-configure-service-url.sh manually." + fi + rm -f /tmp/service_url_fix.json + fi + else + log_warning "Could not login to OSMO. Run ./07-configure-service-url.sh manually." + fi + else + log_warning "Port-forward not ready. Run ./07-configure-service-url.sh manually." 
+ fi + + _cleanup_pf +fi echo "" echo "========================================" -log_success "OSMO Service deployment complete!" +log_success "OSMO Control Plane deployment complete!" echo "========================================" echo "" -echo "OSMO Service Access:" -echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/${OSMO_SVC} 8080:${OSMO_PORT}" -echo " URL: http://localhost:8080" -echo "" -echo "" + +if [[ -n "$INGRESS_URL" ]]; then + echo "OSMO Access (via NGINX Ingress LoadBalancer):" + echo " OSMO API: ${INGRESS_URL}/api/version" + echo " OSMO UI: ${INGRESS_URL}" + echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" + echo "" +else + log_warning "Could not detect Ingress LoadBalancer IP." + echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" + echo "" + echo " Fallback (port-forward):" + echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" + echo " URL: http://localhost:8080" + echo "" +fi + echo "NOTE: OSMO API authentication is DISABLED for testing." echo " The API is accessible without tokens." 
echo "" echo "Test the API:" -echo " curl http://localhost:8080/api/version" -echo " curl http://localhost:8080/api/workflow" +if [[ -n "$INGRESS_URL" ]]; then + echo " curl ${INGRESS_URL}/api/version" + echo " curl ${INGRESS_URL}/api/workflow" +else + echo " curl http://localhost:8080/api/version" + echo " curl http://localhost:8080/api/workflow" +fi echo "" if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then echo "Keycloak Access (for future use):" @@ -1311,8 +1418,9 @@ if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then echo " Test User: osmo-admin / osmo-admin" echo "" fi -echo "Next step - Deploy Backend Operator:" -echo " ./04-deploy-osmo-backend.sh" +echo "Ingress resources:" +kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true echo "" -echo "In-cluster URL (for pods): http://${OSMO_SVC}.${OSMO_NAMESPACE}.svc.cluster.local:${OSMO_PORT}" +echo "Next step - Deploy Backend Operator:" +echo " ./05-deploy-osmo-backend.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh similarity index 92% rename from applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh rename to applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh index 23c8fa47b..978d2bfed 100755 --- a/applications/osmo/deploy/002-setup/04-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh @@ -50,7 +50,7 @@ if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" else echo "" - log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./03-deploy-osmo-control-plane.sh" + log_error "Could not detect OSMO Agent service. 
Deploy OSMO first: ./04-deploy-osmo-control-plane.sh" log_error "Note: Backend operators require osmo-agent service for WebSocket connections" exit 1 fi @@ -276,14 +276,28 @@ echo "" echo "Backend Name: ${BACKEND_NAME}" echo "Agent URL (WebSocket): ${OSMO_SERVICE_URL}" echo "" +# Detect Ingress URL for verification instructions +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + echo "To verify the backend registration:" echo "" -echo " Terminal 1 - Start port-forward (keep running):" -echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" -echo "" -echo " Terminal 2 - Check backend status:" -echo " osmo config show BACKEND ${BACKEND_NAME}" +if [[ -n "$INGRESS_URL" ]]; then + echo " Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl (using NGINX Ingress LoadBalancer):" + echo " curl ${INGRESS_URL}/api/configs/backend" +else + echo " Terminal 1 - Start port-forward (keep running):" + echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" + echo "" + echo " Terminal 2 - Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl:" + echo " curl http://localhost:8080/api/configs/backend" +fi echo "" -echo " Or via curl:" -echo " curl http://localhost:8080/api/configs/backend" +echo "Next step - Configure Storage:" +echo " ./06-configure-storage.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/05-configure-storage.sh b/applications/osmo/deploy/002-setup/06-configure-storage.sh similarity index 96% rename from applications/osmo/deploy/002-setup/05-configure-storage.sh rename to applications/osmo/deploy/002-setup/06-configure-storage.sh index 47e7a9d53..9b455a281 100755 --- a/applications/osmo/deploy/002-setup/05-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/06-configure-storage.sh @@ -64,7 +64,7 @@ if ! 
kubectl get secret osmo-storage -n osmo &>/dev/null; then if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then log_error "Could not retrieve storage credentials" echo "" - echo "Either re-run 03-deploy-osmo-control-plane.sh or create the secret manually:" + echo "Either re-run 04-deploy-osmo-control-plane.sh or create the secret manually:" echo "" echo " kubectl create secret generic osmo-storage \\" echo " --namespace osmo \\" @@ -174,7 +174,7 @@ EOF echo "$WORKFLOW_LOG_CONFIG" > /tmp/workflow_log_config.json # Use EDITOR='tee' trick to bypass interactive editor -if echo 'Configure workflow log storage' | EDITOR='tee' osmo config update WORKFLOW --file /tmp/workflow_log_config.json 2>/dev/null; then +if osmo config update WORKFLOW --file /tmp/workflow_log_config.json --description "Configure workflow log storage" 2>/dev/null; then log_success "Workflow log storage configured" else log_error "Failed to configure workflow log storage" @@ -206,7 +206,7 @@ EOF echo "$WORKFLOW_DATA_CONFIG" > /tmp/workflow_data_config.json # Use EDITOR='tee' trick to bypass interactive editor -if echo 'Configure workflow data storage' | EDITOR='tee' osmo config update WORKFLOW --file /tmp/workflow_data_config.json 2>/dev/null; then +if osmo config update WORKFLOW --file /tmp/workflow_data_config.json --description "Configure workflow data storage" 2>/dev/null; then log_success "Workflow data storage configured" else log_error "Failed to configure workflow data storage" diff --git a/applications/osmo/deploy/002-setup/06-configure-service-url.sh b/applications/osmo/deploy/002-setup/07-configure-service-url.sh similarity index 69% rename from applications/osmo/deploy/002-setup/06-configure-service-url.sh rename to applications/osmo/deploy/002-setup/07-configure-service-url.sh index 76c4ee481..2ec685d04 100755 --- a/applications/osmo/deploy/002-setup/06-configure-service-url.sh +++ b/applications/osmo/deploy/002-setup/07-configure-service-url.sh @@ -8,6 +8,7 @@ set -e SCRIPT_DIR="$(cd 
"$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" echo "" echo "========================================" @@ -56,6 +57,27 @@ if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; fi log_success "Logged in successfully" +# ----------------------------------------------------------------------------- +# Determine the target service URL +# ----------------------------------------------------------------------------- +log_info "Determining target service URL..." + +# Priority: +# 1. Explicit OSMO_INGRESS_BASE_URL (user override) +# 2. Auto-detect from NGINX Ingress Controller LoadBalancer +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${SERVICE_URL}" +elif DETECTED_URL=$(detect_service_url 2>/dev/null) && [[ -n "$DETECTED_URL" ]]; then + SERVICE_URL="${DETECTED_URL}" + log_info "Auto-detected service URL: ${SERVICE_URL}" +else + log_error "Could not detect NGINX Ingress Controller URL." + log_error "Ensure 03-deploy-nginx-ingress.sh was run and the LoadBalancer has an IP." + log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=http://" + exit 1 +fi + # ----------------------------------------------------------------------------- # Check current service_base_url # ----------------------------------------------------------------------------- @@ -64,22 +86,20 @@ log_info "Checking current service_base_url..." CURRENT_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') echo "Current service_base_url: '${CURRENT_URL}'" -if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then - log_success "service_base_url is already configured: ${CURRENT_URL}" - echo "" - echo "To reconfigure, delete the current value first or update manually." 
+if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" && "$CURRENT_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url is already correctly configured: ${CURRENT_URL}" cleanup_port_forward trap - EXIT exit 0 +elif [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then + log_warning "service_base_url is set to '${CURRENT_URL}' but should be '${SERVICE_URL}'" + log_info "Updating service_base_url..." fi # ----------------------------------------------------------------------------- # Configure service_base_url # ----------------------------------------------------------------------------- -log_info "Configuring service_base_url..." - -# The osmo-ctrl sidecar needs to connect to the OSMO service via the proxy -SERVICE_URL="http://osmo-proxy.osmo.svc.cluster.local:80" +log_info "Configuring service_base_url to: ${SERVICE_URL}" cat > /tmp/service_url_fix.json << EOF { @@ -87,7 +107,7 @@ cat > /tmp/service_url_fix.json << EOF } EOF -if echo 'Configure service URL' | EDITOR='tee' osmo config update SERVICE --file /tmp/service_url_fix.json 2>/dev/null; then +if osmo config update SERVICE --file /tmp/service_url_fix.json --description "Set service_base_url for osmo-ctrl sidecar" 2>/dev/null; then log_success "service_base_url configured" else log_error "Failed to configure service_base_url" diff --git a/applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh similarity index 100% rename from applications/osmo/deploy/002-setup/07-configure-gpu-platform.sh rename to applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh diff --git a/applications/osmo/deploy/002-setup/README.md b/applications/osmo/deploy/002-setup/README.md index fab486567..05ec8b55c 100755 --- a/applications/osmo/deploy/002-setup/README.md +++ b/applications/osmo/deploy/002-setup/README.md @@ -21,20 +21,20 @@ Run scripts in order: # 2. Observability (Prometheus, Grafana, Loki) ./02-deploy-observability.sh -# 3. 
OSMO Control Plane -./03-deploy-osmo-control-plane.sh +# 3. NGINX Ingress Controller (required – provides routing for OSMO services) +./03-deploy-nginx-ingress.sh -# 4. OSMO Backend -./04-deploy-osmo-backend.sh +# 4. OSMO Control Plane +./04-deploy-osmo-control-plane.sh -# 5. Configure Storage (requires port-forward, see main README) -./05-configure-storage.sh +# 5. OSMO Backend +./05-deploy-osmo-backend.sh -# 6. Configure Service URL (required for workflows) -./06-configure-service-url.sh +# 6. Configure Storage (requires port-forward, see main README) +./06-configure-storage.sh # 7. Configure GPU Platform (required for GPU workflows) -./07-configure-gpu-platform.sh +./08-configure-gpu-platform.sh ``` ## Scripts @@ -43,11 +43,12 @@ Run scripts in order: |--------|---------|----------| | `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | | `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | -| `03-deploy-osmo-control-plane.sh` | OSMO Control Plane, nginx proxy, database secrets | ~5 min | -| `04-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | -| `05-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | -| `06-configure-service-url.sh` | Configure service URL for osmo-ctrl sidecar | ~1 min | -| `07-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | +| `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | +| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | +| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | +| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | +| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | 
~1 min | ## Configuration @@ -66,7 +67,7 @@ Customize deployments by editing files in `values/`: ### Environment Variables -Configure via `defaults.sh`: +Configure via `defaults.sh` or export before running: ```bash # Namespaces @@ -77,6 +78,10 @@ OSMO_NAMESPACE="osmo" # Grafana password (auto-generated if empty) GRAFANA_ADMIN_PASSWORD="" + +# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) +OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. osmo.example.com); leave empty for IP-based access +OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty ``` ### Secrets from MysteryBox @@ -88,7 +93,7 @@ If you ran `secrets-init.sh` in the prerequisites step, the following environmen | `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | | `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | -The `03-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. +The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. **Secret retrieval order:** 1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) @@ -173,7 +178,7 @@ Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pod ### Option 1: Run the Configuration Script (Recommended) ```bash -./07-configure-gpu-platform.sh +./08-configure-gpu-platform.sh ``` ### Option 2: Manual Configuration via API @@ -349,7 +354,7 @@ If OSMO shows 0 GPUs or GPU workflows fail to schedule: 4. If missing, run the GPU configuration: ```bash - ./07-configure-gpu-platform.sh + ./08-configure-gpu-platform.sh ``` 5. 
Verify OSMO sees GPU resources: diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh new file mode 100755 index 000000000..f9f6d43d6 --- /dev/null +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Uninstall NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" +log_info "Uninstalling NGINX Ingress Controller..." +helm uninstall "${INGRESS_RELEASE_NAME}" -n "${INGRESS_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${INGRESS_NAMESPACE}" --ignore-not-found --timeout=60s 2>/dev/null || true +log_success "NGINX Ingress Controller uninstalled" diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 14aae0071..fd6062a5c 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -31,6 +31,14 @@ export PROMETHEUS_RETENTION_DAYS="15" export LOKI_RETENTION_DAYS="7" export GRAFANA_ADMIN_PASSWORD="" # Auto-generated if empty +# NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) +# Namespace where the NGINX Ingress Controller is deployed. +export INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +# Hostname for Ingress rules (e.g. osmo.example.com). Leave empty to use the LoadBalancer IP directly. +export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" +# Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty. 
+export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}" + # Paths export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" export VALUES_DIR="${SCRIPT_DIR}/values" diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index 68800b7f0..50dc47dfb 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -197,6 +197,57 @@ wait_for_pods() { --timeout=10s } +# Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. +# Mirrors the Azure reference architecture's detect_service_url() pattern. +# +# Lookup order: +# 1. LoadBalancer external IP (cloud assigns a public/internal IP) +# 2. LoadBalancer hostname (some clouds return a DNS name instead) +# 3. Controller ClusterIP (fallback – works from inside the cluster) +# +# Usage: +# url=$(detect_service_url) +# [[ -n "$url" ]] && echo "OSMO reachable at $url" +detect_service_url() { + local ns="${INGRESS_NAMESPACE:-ingress-nginx}" + local url="" + + # Find the controller service (works for the community ingress-nginx chart) + local lb_ip lb_host cluster_ip svc_name + svc_name=$(kubectl get svc -n "$ns" \ + -l app.kubernetes.io/name=ingress-nginx,app.kubernetes.io/component=controller \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + + if [[ -n "$svc_name" ]]; then + # 1. LoadBalancer IP + lb_ip=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$lb_ip" ]]; then + echo "http://${lb_ip}" + return 0 + fi + + # 2. LoadBalancer hostname (e.g. ELB on AWS) + lb_host=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true) + if [[ -n "$lb_host" ]]; then + echo "http://${lb_host}" + return 0 + fi + + # 3. 
ClusterIP of the controller + cluster_ip=$(kubectl get svc "$svc_name" -n "$ns" \ + -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true) + if [[ -n "$cluster_ip" && "$cluster_ip" != "None" ]]; then + echo "http://${cluster_ip}" + return 0 + fi + fi + + # Nothing found + return 1 +} + # Get Terraform output (supports nested values like "postgresql.host") get_tf_output() { local name=$1 diff --git a/applications/osmo/deploy/002-setup/nginx-proxy.yaml b/applications/osmo/deploy/002-setup/nginx-proxy.yaml deleted file mode 100755 index b8eab7837..000000000 --- a/applications/osmo/deploy/002-setup/nginx-proxy.yaml +++ /dev/null @@ -1,120 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: osmo-proxy-nginx-config - namespace: osmo -data: - nginx.conf: | - events { - worker_connections 1024; - } - - http { - # Logging - access_log /dev/stdout; - error_log /dev/stderr; - - # Conditional WebSocket support - # Sets Connection header to "upgrade" for WebSocket requests, "close" otherwise - # This is important for proper handling of both WebSocket and regular HTTP requests - map $http_upgrade $connection_upgrade { - default upgrade; - '' close; - } - - # Upstream servers - upstream osmo-service { - server osmo-service.osmo.svc.cluster.local:80; - } - - upstream osmo-logger { - server osmo-logger.osmo.svc.cluster.local:80; - } - - upstream osmo-agent { - server osmo-agent.osmo.svc.cluster.local:80; - } - - server { - listen 80; - - # Common proxy headers - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - # WebSocket support (conditional based on Upgrade header) - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - - # Timeouts for long-running WebSocket connections (osmo-ctrl logging) - proxy_read_timeout 3600s; - proxy_send_timeout 3600s; - - # Route /api/logger/* to 
osmo-logger (WebSocket for log streaming) - location /api/logger/ { - proxy_pass http://osmo-logger; - } - - # Route /api/agent/* to osmo-agent (WebSocket for backend communication) - location /api/agent/ { - proxy_pass http://osmo-agent; - } - - # Everything else to osmo-service (REST API) - location / { - proxy_pass http://osmo-service; - } - } - } ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: osmo-proxy - namespace: osmo -spec: - replicas: 1 - selector: - matchLabels: - app: osmo-proxy - template: - metadata: - labels: - app: osmo-proxy - spec: - containers: - - name: nginx - image: nginx:alpine - ports: - - containerPort: 80 - volumeMounts: - - name: nginx-config - mountPath: /etc/nginx/nginx.conf - subPath: nginx.conf - resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 200m - memory: 128Mi - volumes: - - name: nginx-config - configMap: - name: osmo-proxy-nginx-config ---- -apiVersion: v1 -kind: Service -metadata: - name: osmo-proxy - namespace: osmo -spec: - selector: - app: osmo-proxy - ports: - - port: 80 - targetPort: 80 - type: ClusterIP diff --git a/applications/osmo/deploy/README.md b/applications/osmo/deploy/README.md index 21258c450..6264fc2f0 100755 --- a/applications/osmo/deploy/README.md +++ b/applications/osmo/deploy/README.md @@ -72,11 +72,14 @@ cd 002-setup # 2. Deploy observability stack ./02-deploy-observability.sh -# 3. Deploy OSMO control plane -./03-deploy-osmo-control-plane.sh +# 3. Deploy NGINX Ingress Controller +./03-deploy-nginx-ingress.sh -# 4. Deploy OSMO backend -./04-deploy-osmo-backend.sh +# 4. Deploy OSMO control plane +./04-deploy-osmo-control-plane.sh + +# 5. 
Deploy OSMO backend +./05-deploy-osmo-backend.sh ``` ## Directory Structure @@ -106,8 +109,9 @@ deploy/ ├── values/ # Helm values files ├── 01-deploy-gpu-infrastructure.sh ├── 02-deploy-observability.sh - ├── 03-deploy-osmo-control-plane.sh - ├── 04-deploy-osmo-backend.sh + ├── 03-deploy-nginx-ingress.sh + ├── 04-deploy-osmo-control-plane.sh + ├── 05-deploy-osmo-backend.sh ├── cleanup/ # Uninstall scripts └── README.md ``` From 71b874ff49c410468df1a07ae80c0fed9927ab3c Mon Sep 17 00:00:00 2001 From: Jathavan Sriram Date: Wed, 11 Feb 2026 13:25:54 +0100 Subject: [PATCH 13/37] Removal OSMO Proxy - usage of Nebius LB and NGNIX Ingress --- applications/osmo/deploy/002-setup/lib/common.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index 50dc47dfb..a6882922a 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -198,7 +198,6 @@ wait_for_pods() { } # Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. -# Mirrors the Azure reference architecture's detect_service_url() pattern. # # Lookup order: # 1. LoadBalancer external IP (cloud assigns a public/internal IP) From 80046244c38abe83a740e904f2f25b1eb66efdb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Feb 2026 19:52:49 +0100 Subject: [PATCH 14/37] - bug fix --- .../osmo/deploy/002-setup/03-deploy-nginx-ingress.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh index 4da7c82a6..807a0c3a5 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh @@ -43,9 +43,14 @@ log_info "Creating namespace ${INGRESS_NAMESPACE}..." 
kubectl create namespace "${INGRESS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - log_info "Installing NGINX Ingress Controller..." +# --set controller.progressDeadlineSeconds=600: chart v4.14+ defaults to 0 which +# K8s 1.32+ rejects ("must be greater than minReadySeconds"). Without this fix the +# Deployment is invalid, the controller never starts, and the admission webhook +# blocks all Ingress resource creation in downstream scripts. helm upgrade --install "${INGRESS_RELEASE_NAME}" ingress-nginx/ingress-nginx \ --namespace "${INGRESS_NAMESPACE}" \ --set controller.service.type=LoadBalancer \ + --set controller.progressDeadlineSeconds=600 \ --wait --timeout 5m || { log_warning "Helm install returned non-zero; controller may still be starting." } From 987d2c74a4be04818b4312805bb1d6f40ad48a04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Feb 2026 20:11:31 +0100 Subject: [PATCH 15/37] - SSL / TSL implementation --- .../002-setup/03-deploy-nginx-ingress.sh | 1 + .../osmo/deploy/002-setup/09-enable-tls.sh | 242 ++++++++++++++++++ .../cleanup/uninstall-nginx-ingress.sh | 9 + 3 files changed, 252 insertions(+) create mode 100755 applications/osmo/deploy/002-setup/09-enable-tls.sh diff --git a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh index 807a0c3a5..0043dc89e 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh @@ -16,6 +16,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" diff --git a/applications/osmo/deploy/002-setup/09-enable-tls.sh b/applications/osmo/deploy/002-setup/09-enable-tls.sh new file mode 100755 index 000000000..63cb7b1ce --- 
/dev/null +++ b/applications/osmo/deploy/002-setup/09-enable-tls.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# +# Enable TLS/HTTPS for OSMO using cert-manager + Let's Encrypt +# +# Prerequisites: +# 1. OSMO is deployed and accessible over HTTP (scripts 01-05) +# 2. A DNS record points your domain to the LoadBalancer IP +# (check with: kubectl get svc -n ingress-nginx ingress-nginx-controller) +# +# Usage: +# ./09-enable-tls.sh +# +# Example: +# ./09-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud +# +# Optional environment variables: +# OSMO_TLS_EMAIL - Email for Let's Encrypt expiry notices (default: noreply@) +# OSMO_TLS_SECRET_NAME - K8s Secret name for certificate (default: osmo-tls) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" +TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" + +echo "" +echo "========================================" +echo " Enable TLS/HTTPS for OSMO" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Validate inputs +# ----------------------------------------------------------------------------- +if [[ -z "$HOSTNAME" ]]; then + log_error "Usage: $0 " + echo "" + echo "Example: $0 vl51.eu-north1.osmo.nebius.cloud" + echo "" + LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + echo "Your LoadBalancer IP is: ${LB_IP}" + echo "Create a DNS A record pointing your domain to this IP, then re-run this script." 
+ fi + exit 1 +fi + +check_kubectl || exit 1 +check_helm || exit 1 + +log_info "Hostname: ${HOSTNAME}" +log_info "TLS secret: ${TLS_SECRET}" + +# Verify DNS resolves to the LoadBalancer IP +LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) +DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) + +if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then + if [[ "$DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" + log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." + fi +elif [[ -z "$DNS_IP" ]]; then + log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." +fi + +# Verify Ingress resources exist +INGRESS_COUNT=$(kubectl get ingress -n "${OSMO_NS}" --no-headers 2>/dev/null | wc -l | tr -d ' ') +if [[ "$INGRESS_COUNT" -eq 0 ]]; then + log_error "No Ingress resources found in namespace ${OSMO_NS}." + log_error "Run 04-deploy-osmo-control-plane.sh first." + exit 1 +fi +log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS}" + +# ----------------------------------------------------------------------------- +# Step 1: Install cert-manager +# ----------------------------------------------------------------------------- +log_info "Installing cert-manager..." 
+helm repo add jetstack https://charts.jetstack.io --force-update +helm repo update jetstack + +if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "cert-manager already installed" +else + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + --set crds.enabled=true \ + --wait --timeout 5m +fi +log_success "cert-manager ready" + +# ----------------------------------------------------------------------------- +# Step 2: Create Let's Encrypt ClusterIssuer +# ----------------------------------------------------------------------------- +TLS_EMAIL="${OSMO_TLS_EMAIL:-noreply@${HOSTNAME#*.}}" +log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..." + +kubectl apply -f - </dev/null); do + ing_name="${ing#*/}" + # Get current HTTP paths from this ingress + CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') + + kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || echo "") + if [[ "$CERT_READY" == "True" ]]; then + log_success "TLS certificate issued and ready" + break + fi + sleep 5 +done + +if [[ "$CERT_READY" != "True" ]]; then + log_warning "Certificate not ready yet. Checking status..." + kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 + echo "" + log_info "It may take a few more minutes. Check with:" + echo " kubectl get certificate -n ${OSMO_NS}" + echo " kubectl describe challenge -n ${OSMO_NS}" +fi + +# ----------------------------------------------------------------------------- +# Step 5: Update OSMO service_base_url to HTTPS +# ----------------------------------------------------------------------------- +log_info "Updating OSMO service_base_url to https://${HOSTNAME}..." + +kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & +_PF_PID=$! 
+trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT + +# Wait for port-forward +_pf_ready=false +for i in $(seq 1 15); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 +done + +if [[ "$_pf_ready" == "true" ]]; then + # Login + if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + cat > /tmp/service_url_tls.json </dev/null; then + NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + log_success "service_base_url updated to: ${NEW_URL}" + else + log_warning "Could not update service_base_url automatically." + log_info "Run: ./07-configure-service-url.sh https://${HOSTNAME}" + fi + rm -f /tmp/service_url_tls.json + else + log_warning "Could not login to OSMO API. Update service_base_url manually:" + log_info " ./07-configure-service-url.sh https://${HOSTNAME}" + fi +else + log_warning "Could not connect to OSMO API. 
Update service_base_url manually:" + log_info " ./07-configure-service-url.sh https://${HOSTNAME}" +fi + +# ----------------------------------------------------------------------------- +# Done +# ----------------------------------------------------------------------------- +echo "" +echo "========================================" +log_success "TLS enabled for OSMO" +echo "========================================" +echo "" +echo "OSMO is now accessible at:" +echo " https://${HOSTNAME}" +echo " https://${HOSTNAME}/api/version" +echo "" +echo "CLI login:" +echo " osmo login https://${HOSTNAME} --method dev --username admin" +echo "" diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh index f9f6d43d6..471029d5c 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh @@ -9,3 +9,12 @@ log_info "Uninstalling NGINX Ingress Controller..." helm uninstall "${INGRESS_RELEASE_NAME}" -n "${INGRESS_NAMESPACE}" 2>/dev/null || true kubectl delete namespace "${INGRESS_NAMESPACE}" --ignore-not-found --timeout=60s 2>/dev/null || true log_success "NGINX Ingress Controller uninstalled" + +# Uninstall cert-manager (if installed) +if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "Uninstalling cert-manager..." 
+ kubectl delete clusterissuer letsencrypt --ignore-not-found 2>/dev/null || true + helm uninstall cert-manager -n cert-manager 2>/dev/null || true + kubectl delete namespace cert-manager --ignore-not-found --timeout=60s 2>/dev/null || true + log_success "cert-manager uninstalled" +fi From 97cd498cdb134a6c6a38e08818b55b5edce013c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 12:26:35 +0100 Subject: [PATCH 16/37] - fix setup order --- applications/osmo/deploy/001-iac/outputs.tf | 7 +- .../002-setup/03-deploy-nginx-ingress.sh | 7 +- .../{09-enable-tls.sh => 04-enable-tls.sh} | 183 ++++++++++++------ ...ane.sh => 05-deploy-osmo-control-plane.sh} | 71 ++++++- ...o-backend.sh => 06-deploy-osmo-backend.sh} | 4 +- ...ure-storage.sh => 07-configure-storage.sh} | 2 +- ...ice-url.sh => 08-configure-service-url.sh} | 0 ...atform.sh => 09-configure-gpu-platform.sh} | 0 applications/osmo/deploy/002-setup/README.md | 38 ++-- applications/osmo/deploy/README.md | 16 +- 10 files changed, 233 insertions(+), 95 deletions(-) rename applications/osmo/deploy/002-setup/{09-enable-tls.sh => 04-enable-tls.sh} (50%) rename applications/osmo/deploy/002-setup/{04-deploy-osmo-control-plane.sh => 05-deploy-osmo-control-plane.sh} (95%) rename applications/osmo/deploy/002-setup/{05-deploy-osmo-backend.sh => 06-deploy-osmo-backend.sh} (99%) rename applications/osmo/deploy/002-setup/{06-configure-storage.sh => 07-configure-storage.sh} (99%) rename applications/osmo/deploy/002-setup/{07-configure-service-url.sh => 08-configure-service-url.sh} (100%) rename applications/osmo/deploy/002-setup/{08-configure-gpu-platform.sh => 09-configure-gpu-platform.sh} (100%) diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/001-iac/outputs.tf index cc5f31e91..467feaad0 100755 --- a/applications/osmo/deploy/001-iac/outputs.tf +++ b/applications/osmo/deploy/001-iac/outputs.tf @@ -162,10 +162,11 @@ output "next_steps" { 
./01-deploy-gpu-infrastructure.sh ./02-deploy-observability.sh ./03-deploy-nginx-ingress.sh - ./04-deploy-osmo-control-plane.sh - ./05-deploy-osmo-backend.sh + ./04-enable-tls.sh + ./05-deploy-osmo-control-plane.sh + ./06-deploy-osmo-backend.sh - ${var.enable_managed_postgresql ? "PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 04-deploy-osmo-control-plane.sh)"} + ${var.enable_managed_postgresql ? "PostgreSQL Connection (Managed):\n Host: ${module.platform.postgresql_host}\n Port: ${module.platform.postgresql_port}\n Database: ${module.platform.postgresql_database}\n Username: ${module.platform.postgresql_username}" : "PostgreSQL: Using in-cluster PostgreSQL (deployed via Helm in 05-deploy-osmo-control-plane.sh)"} Object Storage: Bucket: ${module.platform.storage_bucket_name} diff --git a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh index 0043dc89e..5ecda68d3 100755 --- a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh +++ b/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh @@ -9,7 +9,7 @@ # We do not use the quick-start umbrella chart here (Nebius uses managed DB, etc.), # so we install the controller explicitly. Not a duplicate of OSMO—same upstream chart. # -# Run before 04-deploy-osmo-control-plane.sh. +# Run before 05-deploy-osmo-control-plane.sh. # See: https://kubernetes.github.io/ingress-nginx/deploy/ set -e @@ -70,7 +70,7 @@ for i in $(seq 1 24); do echo "OSMO will be accessible at:" echo " http://${LB_IP}" echo "" - echo "This URL is auto-detected by 04-deploy-osmo-control-plane.sh." + echo "This URL is auto-detected by 05-deploy-osmo-control-plane.sh." 
echo "" break fi @@ -84,5 +84,6 @@ echo "========================================" log_success "NGINX Ingress deployment complete" echo "========================================" echo "" -echo "Next: run 04-deploy-osmo-control-plane.sh" +echo "Next: run 04-enable-tls.sh (optional, recommended)" +echo " then 05-deploy-osmo-control-plane.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/09-enable-tls.sh b/applications/osmo/deploy/002-setup/04-enable-tls.sh similarity index 50% rename from applications/osmo/deploy/002-setup/09-enable-tls.sh rename to applications/osmo/deploy/002-setup/04-enable-tls.sh index 63cb7b1ce..90eceab3b 100755 --- a/applications/osmo/deploy/002-setup/09-enable-tls.sh +++ b/applications/osmo/deploy/002-setup/04-enable-tls.sh @@ -1,17 +1,27 @@ #!/bin/bash # -# Enable TLS/HTTPS for OSMO using cert-manager + Let's Encrypt +# Enable TLS/HTTPS using cert-manager + Let's Encrypt +# +# Can be run at two points in the deployment flow: +# +# A) Right after 03-deploy-nginx-ingress.sh (RECOMMENDED): +# Installs cert-manager, issues the TLS certificate early. +# When 05-deploy-osmo-control-plane.sh runs later, it auto-detects the +# certificate and creates TLS-enabled Ingress resources from the start. +# +# B) After 05-deploy-osmo-control-plane.sh (retrofit existing deployment): +# Does everything in (A) plus patches existing OSMO Ingress resources +# and updates service_base_url to HTTPS. # # Prerequisites: -# 1. OSMO is deployed and accessible over HTTP (scripts 01-05) -# 2. A DNS record points your domain to the LoadBalancer IP -# (check with: kubectl get svc -n ingress-nginx ingress-nginx-controller) +# 1. NGINX Ingress Controller deployed (03-deploy-nginx-ingress.sh) +# 2. 
A DNS A record pointing your domain to the LoadBalancer IP # # Usage: -# ./09-enable-tls.sh +# ./04-enable-tls.sh # # Example: -# ./09-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud +# ./04-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud # # Optional environment variables: # OSMO_TLS_EMAIL - Email for Let's Encrypt expiry notices (default: noreply@) @@ -31,7 +41,7 @@ INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" echo "" echo "========================================" -echo " Enable TLS/HTTPS for OSMO" +echo " Enable TLS/HTTPS" echo "========================================" echo "" @@ -74,14 +84,16 @@ elif [[ -z "$DNS_IP" ]]; then log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." fi -# Verify Ingress resources exist +# Check if OSMO is already deployed (determines whether to patch Ingress / update config) INGRESS_COUNT=$(kubectl get ingress -n "${OSMO_NS}" --no-headers 2>/dev/null | wc -l | tr -d ' ') -if [[ "$INGRESS_COUNT" -eq 0 ]]; then - log_error "No Ingress resources found in namespace ${OSMO_NS}." - log_error "Run 04-deploy-osmo-control-plane.sh first." - exit 1 +if [[ "$INGRESS_COUNT" -gt 0 ]]; then + log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS} (will patch with TLS)" + OSMO_DEPLOYED="true" +else + log_info "No OSMO Ingress resources yet — preparing cert-manager and certificate" + log_info "Step 05 will auto-detect the TLS cert and create HTTPS Ingress" + OSMO_DEPLOYED="false" fi -log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS}" # ----------------------------------------------------------------------------- # Step 1: Install cert-manager @@ -125,16 +137,21 @@ EOF log_success "ClusterIssuer created" # ----------------------------------------------------------------------------- -# Step 3: Patch all Ingress resources with TLS +# Step 3: Issue TLS certificate # ----------------------------------------------------------------------------- -log_info "Patching Ingress resources for TLS..." 
-for ing in $(kubectl get ingress -n "${OSMO_NS}" -o name 2>/dev/null); do - ing_name="${ing#*/}" - # Get current HTTP paths from this ingress - CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') +# Ensure the OSMO namespace exists (needed for Certificate resource) +kubectl create namespace "${OSMO_NS}" --dry-run=client -o yaml | kubectl apply -f - + +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + # Mode B: Patch existing Ingress resources with TLS + log_info "Patching Ingress resources for TLS..." + + for ing in $(kubectl get ingress -n "${OSMO_NS}" -o name 2>/dev/null); do + ing_name="${ing#*/}" + CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') - kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null); do } PATCH )" && log_success " ${ing_name} patched" || log_warning " Failed to patch ${ing_name}" -done + done +else + # Mode A: Create a temporary Ingress to trigger HTTP-01 challenge + # cert-manager needs an Ingress with the annotation to issue the cert + log_info "Creating temporary Ingress for certificate issuance..." + kubectl apply -f - </dev/null & -_PF_PID=$! -trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT + kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & + _PF_PID=$! 
+ trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT -# Wait for port-forward -_pf_ready=false -for i in $(seq 1 15); do - if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then - _pf_ready=true - break - fi - sleep 1 -done + # Wait for port-forward + _pf_ready=false + for i in $(seq 1 15); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 + done -if [[ "$_pf_ready" == "true" ]]; then - # Login - if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then - cat > /tmp/service_url_tls.json </dev/null; then + cat > /tmp/service_url_tls.json </dev/null; then - NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - log_success "service_base_url updated to: ${NEW_URL}" + if osmo config update SERVICE --file /tmp/service_url_tls.json --description "Enable HTTPS" 2>/dev/null; then + NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + log_success "service_base_url updated to: ${NEW_URL}" + else + log_warning "Could not update service_base_url automatically." + log_info "Run: ./08-configure-service-url.sh https://${HOSTNAME}" + fi + rm -f /tmp/service_url_tls.json else - log_warning "Could not update service_base_url automatically." - log_info "Run: ./07-configure-service-url.sh https://${HOSTNAME}" + log_warning "Could not login to OSMO API. Update service_base_url manually:" + log_info " ./08-configure-service-url.sh https://${HOSTNAME}" fi - rm -f /tmp/service_url_tls.json else - log_warning "Could not login to OSMO API. Update service_base_url manually:" - log_info " ./07-configure-service-url.sh https://${HOSTNAME}" + log_warning "Could not connect to OSMO API. 
Update service_base_url manually:" + log_info " ./08-configure-service-url.sh https://${HOSTNAME}" fi else - log_warning "Could not connect to OSMO API. Update service_base_url manually:" - log_info " ./07-configure-service-url.sh https://${HOSTNAME}" + log_info "Skipping service_base_url update (OSMO not deployed yet)" + log_info "Step 05 will auto-detect TLS and use https:// for service_base_url" +fi + +# ----------------------------------------------------------------------------- +# Step 6: Clean up bootstrap Ingress (if OSMO was deployed after cert issued) +# ----------------------------------------------------------------------------- +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + # Remove the bootstrap ingress if it exists (from a previous Mode A run) + kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null fi # ----------------------------------------------------------------------------- @@ -230,13 +291,23 @@ fi # ----------------------------------------------------------------------------- echo "" echo "========================================" -log_success "TLS enabled for OSMO" +log_success "TLS setup complete" echo "========================================" echo "" -echo "OSMO is now accessible at:" -echo " https://${HOSTNAME}" -echo " https://${HOSTNAME}/api/version" -echo "" -echo "CLI login:" -echo " osmo login https://${HOSTNAME} --method dev --username admin" + +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + echo "OSMO is now accessible at:" + echo " https://${HOSTNAME}" + echo " https://${HOSTNAME}/api/version" + echo "" + echo "CLI login:" + echo " osmo login https://${HOSTNAME} --method dev --username admin" +else + echo "TLS certificate prepared for: ${HOSTNAME}" + echo "" + echo "Next steps:" + echo " 1. Wait for certificate to be ready: kubectl get certificate -n ${OSMO_NS}" + echo " 2. 
Deploy OSMO: ./05-deploy-osmo-control-plane.sh" + echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" +fi echo "" diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh similarity index 95% rename from applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh rename to applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh index 266db7016..9ef8d5d7b 100755 --- a/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh @@ -859,6 +859,17 @@ else log_info "Ingress hostname: (any — IP-based access)" fi +# Auto-detect TLS certificate (created by 04-enable-tls.sh run before this script) +TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +TLS_ENABLED="false" +if kubectl get secret "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then + log_success "TLS certificate detected (${TLS_SECRET}) — will create HTTPS Ingress" + TLS_ENABLED="true" +elif kubectl get certificate "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then + log_info "TLS certificate pending (${TLS_SECRET}) — will create HTTPS Ingress" + TLS_ENABLED="true" +fi + # Create the values file with proper extraEnv and extraVolumes for each service # This configures PostgreSQL password via env var and MEK via volume mount cat > /tmp/osmo_values.yaml </dev/null); do + ing_name="${ing#*/}" + [[ "$ing_name" == "osmo-tls-bootstrap" ]] && continue # skip bootstrap ingress + CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NAMESPACE}" -o jsonpath='{.spec.rules[0].http}') + + kubectl patch "$ing" -n "${OSMO_NAMESPACE}" --type=merge -p "$(cat </dev/null + log_success "Ingress TLS patching complete" +fi + # ----------------------------------------------------------------------------- # Step 9: Patch Deployments to Add vault-secrets Volume # 
----------------------------------------------------------------------------- @@ -1302,12 +1350,15 @@ INGRESS_URL=$(detect_service_url 2>/dev/null || true) if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then TARGET_SERVICE_URL="${OSMO_INGRESS_BASE_URL}" log_info "Using explicit Ingress base URL: ${TARGET_SERVICE_URL}" +elif [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then + TARGET_SERVICE_URL="https://${INGRESS_HOSTNAME}" + log_info "TLS detected, using HTTPS: ${TARGET_SERVICE_URL}" elif [[ -n "$INGRESS_URL" ]]; then TARGET_SERVICE_URL="${INGRESS_URL}" log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" else log_warning "Could not detect Ingress URL. Skipping service_base_url configuration." - log_warning "Run ./07-configure-service-url.sh manually after verifying the Ingress." + log_warning "Run ./08-configure-service-url.sh manually after verifying the Ingress." TARGET_SERVICE_URL="" fi @@ -1359,18 +1410,18 @@ SVCEOF if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then log_success "service_base_url configured: ${NEW_SVC_URL}" else - log_warning "service_base_url verification failed. Run ./07-configure-service-url.sh manually." + log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." fi else - log_warning "Failed to set service_base_url. Run ./07-configure-service-url.sh manually." + log_warning "Failed to set service_base_url. Run ./08-configure-service-url.sh manually." fi rm -f /tmp/service_url_fix.json fi else - log_warning "Could not login to OSMO. Run ./07-configure-service-url.sh manually." + log_warning "Could not login to OSMO. Run ./08-configure-service-url.sh manually." fi else - log_warning "Port-forward not ready. Run ./07-configure-service-url.sh manually." + log_warning "Port-forward not ready. Run ./08-configure-service-url.sh manually." fi _cleanup_pf @@ -1382,7 +1433,13 @@ log_success "OSMO Control Plane deployment complete!" 
echo "========================================" echo "" -if [[ -n "$INGRESS_URL" ]]; then +if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then + echo "OSMO Access (HTTPS via NGINX Ingress + cert-manager):" + echo " OSMO API: https://${INGRESS_HOSTNAME}/api/version" + echo " OSMO UI: https://${INGRESS_HOSTNAME}" + echo " OSMO CLI: osmo login https://${INGRESS_HOSTNAME} --method dev --username admin" + echo "" +elif [[ -n "$INGRESS_URL" ]]; then echo "OSMO Access (via NGINX Ingress LoadBalancer):" echo " OSMO API: ${INGRESS_URL}/api/version" echo " OSMO UI: ${INGRESS_URL}" @@ -1422,5 +1479,5 @@ echo "Ingress resources:" kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true echo "" echo "Next step - Deploy Backend Operator:" -echo " ./05-deploy-osmo-backend.sh" +echo " ./06-deploy-osmo-backend.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh similarity index 99% rename from applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh rename to applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh index 978d2bfed..0b5b495d7 100755 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh @@ -50,7 +50,7 @@ if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" else echo "" - log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./04-deploy-osmo-control-plane.sh" + log_error "Could not detect OSMO Agent service. 
Deploy OSMO first: ./05-deploy-osmo-control-plane.sh" log_error "Note: Backend operators require osmo-agent service for WebSocket connections" exit 1 fi @@ -299,5 +299,5 @@ else fi echo "" echo "Next step - Configure Storage:" -echo " ./06-configure-storage.sh" +echo " ./07-configure-storage.sh" echo "" diff --git a/applications/osmo/deploy/002-setup/06-configure-storage.sh b/applications/osmo/deploy/002-setup/07-configure-storage.sh similarity index 99% rename from applications/osmo/deploy/002-setup/06-configure-storage.sh rename to applications/osmo/deploy/002-setup/07-configure-storage.sh index 9b455a281..61b31b123 100755 --- a/applications/osmo/deploy/002-setup/06-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/07-configure-storage.sh @@ -64,7 +64,7 @@ if ! kubectl get secret osmo-storage -n osmo &>/dev/null; then if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then log_error "Could not retrieve storage credentials" echo "" - echo "Either re-run 04-deploy-osmo-control-plane.sh or create the secret manually:" + echo "Either re-run 05-deploy-osmo-control-plane.sh or create the secret manually:" echo "" echo " kubectl create secret generic osmo-storage \\" echo " --namespace osmo \\" diff --git a/applications/osmo/deploy/002-setup/07-configure-service-url.sh b/applications/osmo/deploy/002-setup/08-configure-service-url.sh similarity index 100% rename from applications/osmo/deploy/002-setup/07-configure-service-url.sh rename to applications/osmo/deploy/002-setup/08-configure-service-url.sh diff --git a/applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh similarity index 100% rename from applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh rename to applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh diff --git a/applications/osmo/deploy/002-setup/README.md b/applications/osmo/deploy/002-setup/README.md index 05ec8b55c..0cde10d5a 100755 --- 
a/applications/osmo/deploy/002-setup/README.md +++ b/applications/osmo/deploy/002-setup/README.md @@ -24,17 +24,20 @@ Run scripts in order: # 3. NGINX Ingress Controller (required – provides routing for OSMO services) ./03-deploy-nginx-ingress.sh -# 4. OSMO Control Plane -./04-deploy-osmo-control-plane.sh +# 4. Enable TLS (optional, recommended – set up DNS A record first) +./04-enable-tls.sh -# 5. OSMO Backend -./05-deploy-osmo-backend.sh +# 5. OSMO Control Plane +./05-deploy-osmo-control-plane.sh -# 6. Configure Storage (requires port-forward, see main README) -./06-configure-storage.sh +# 6. OSMO Backend +./06-deploy-osmo-backend.sh -# 7. Configure GPU Platform (required for GPU workflows) -./08-configure-gpu-platform.sh +# 7. Configure Storage (requires port-forward, see main README) +./07-configure-storage.sh + +# 8. Configure GPU Platform (required for GPU workflows) +./09-configure-gpu-platform.sh ``` ## Scripts @@ -44,11 +47,12 @@ Run scripts in order: | `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | | `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | | `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | -| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | -| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | -| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | -| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | -| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | +| `04-enable-tls.sh` | TLS/HTTPS via cert-manager + Let's Encrypt (optional, recommended) | ~2 min | +| `05-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | +| `06-deploy-osmo-backend.sh` | OSMO Backend 
operator | ~5 min | +| `07-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `08-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | +| `09-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | ## Configuration @@ -79,7 +83,7 @@ OSMO_NAMESPACE="osmo" # Grafana password (auto-generated if empty) GRAFANA_ADMIN_PASSWORD="" -# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) +# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 05-deploy-osmo-control-plane.sh) OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. osmo.example.com); leave empty for IP-based access OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty ``` @@ -93,7 +97,7 @@ If you ran `secrets-init.sh` in the prerequisites step, the following environmen | `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | | `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | -The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. +The `05-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. **Secret retrieval order:** 1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) @@ -178,7 +182,7 @@ Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pod ### Option 1: Run the Configuration Script (Recommended) ```bash -./08-configure-gpu-platform.sh +./09-configure-gpu-platform.sh ``` ### Option 2: Manual Configuration via API @@ -354,7 +358,7 @@ If OSMO shows 0 GPUs or GPU workflows fail to schedule: 4. 
If missing, run the GPU configuration: ```bash - ./08-configure-gpu-platform.sh + ./09-configure-gpu-platform.sh ``` 5. Verify OSMO sees GPU resources: diff --git a/applications/osmo/deploy/README.md b/applications/osmo/deploy/README.md index 6264fc2f0..aeacc9336 100755 --- a/applications/osmo/deploy/README.md +++ b/applications/osmo/deploy/README.md @@ -75,11 +75,14 @@ cd 002-setup # 3. Deploy NGINX Ingress Controller ./03-deploy-nginx-ingress.sh -# 4. Deploy OSMO control plane -./04-deploy-osmo-control-plane.sh +# 4. Enable TLS (optional, recommended – set up DNS A record first) +./04-enable-tls.sh -# 5. Deploy OSMO backend -./05-deploy-osmo-backend.sh +# 5. Deploy OSMO control plane +./05-deploy-osmo-control-plane.sh + +# 6. Deploy OSMO backend +./06-deploy-osmo-backend.sh ``` ## Directory Structure @@ -110,8 +113,9 @@ deploy/ ├── 01-deploy-gpu-infrastructure.sh ├── 02-deploy-observability.sh ├── 03-deploy-nginx-ingress.sh - ├── 04-deploy-osmo-control-plane.sh - ├── 05-deploy-osmo-backend.sh + ├── 04-enable-tls.sh + ├── 05-deploy-osmo-control-plane.sh + ├── 06-deploy-osmo-backend.sh ├── cleanup/ # Uninstall scripts └── README.md ``` From c939d060fbf54a92256615b0615474c1e124e989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 16:04:51 +0100 Subject: [PATCH 17/37] - add keycloak --- .../osmo/deploy/002-setup/04-enable-tls.sh | 147 +- .../002-setup/05-deploy-osmo-control-plane.sh | 1036 +++++-- .../002-setup/06-deploy-osmo-backend.sh | 102 +- .../deploy/002-setup/07-configure-storage.sh | 22 +- .../002-setup/08-configure-service-url.sh | 15 +- .../002-setup/cleanup/uninstall-keycloak.sh | 62 + .../osmo/deploy/002-setup/defaults.sh | 9 + .../osmo/deploy/002-setup/lib/common.sh | 118 +- .../deploy/002-setup/sample_osmo_realm.json | 2636 +++++++++++++++++ 9 files changed, 3786 insertions(+), 361 deletions(-) create mode 100755 applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh create mode 100644 
applications/osmo/deploy/002-setup/sample_osmo_realm.json diff --git a/applications/osmo/deploy/002-setup/04-enable-tls.sh b/applications/osmo/deploy/002-setup/04-enable-tls.sh index 90eceab3b..22c8138a5 100755 --- a/applications/osmo/deploy/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/002-setup/04-enable-tls.sh @@ -68,20 +68,79 @@ check_helm || exit 1 log_info "Hostname: ${HOSTNAME}" log_info "TLS secret: ${TLS_SECRET}" -# Verify DNS resolves to the LoadBalancer IP +# Keycloak auth subdomain support +DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +AUTH_HOSTNAME="" +if [[ "$DEPLOY_KEYCLOAK" == "true" ]]; then + if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + AUTH_HOSTNAME="${KEYCLOAK_HOSTNAME}" + else + AUTH_HOSTNAME="auth.${HOSTNAME}" + fi + log_info "Keycloak auth hostname: ${AUTH_HOSTNAME}" + log_info "Keycloak TLS secret: ${KC_TLS_SECRET}" +fi + +# Get LoadBalancer IP LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) -DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) -if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then - if [[ "$DNS_IP" == "$LB_IP" ]]; then - log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" - else - log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" - log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." +# Prompt user to set up DNS records before proceeding +echo "" +echo "========================================" +echo " DNS Record Setup Required" +echo "========================================" +echo "" +if [[ -n "$LB_IP" ]]; then + echo "Create the following DNS A record(s) pointing to your LoadBalancer IP:" + echo "" + echo " ${HOSTNAME} -> ${LB_IP}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME} -> ${LB_IP}" + fi +else + echo "LoadBalancer IP not yet assigned. 
Check with:" + echo " kubectl get svc -n ${INGRESS_NS} ingress-nginx-controller" + echo "" + echo "Once the IP is available, create DNS A record(s) for:" + echo " ${HOSTNAME}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME}" + fi +fi +echo "" +echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." +echo "" +read_prompt_var "Press Enter once DNS records are configured (or type 'skip' to skip DNS check)" DNS_CONFIRM "" + +# Verify DNS resolves to the LoadBalancer IP +if [[ "$DNS_CONFIRM" != "skip" ]]; then + DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) + + if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then + if [[ "$DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" + log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." + fi + elif [[ -z "$DNS_IP" ]]; then + log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." + fi + + if [[ -n "$AUTH_HOSTNAME" ]]; then + AUTH_DNS_IP=$(dig +short "$AUTH_HOSTNAME" 2>/dev/null | tail -1 || true) + if [[ -n "$LB_IP" && -n "$AUTH_DNS_IP" ]]; then + if [[ "$AUTH_DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP}, but LoadBalancer IP is ${LB_IP}" + fi + elif [[ -z "$AUTH_DNS_IP" ]]; then + log_warning "Could not resolve ${AUTH_HOSTNAME}. Keycloak TLS cert may fail." + fi fi -elif [[ -z "$DNS_IP" ]]; then - log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." 
fi # Check if OSMO is already deployed (determines whether to patch Ingress / update config) @@ -230,6 +289,65 @@ if [[ "$CERT_READY" != "True" ]]; then echo " kubectl describe challenge -n ${OSMO_NS}" fi +# ----------------------------------------------------------------------------- +# Step 4b: Issue TLS certificate for Keycloak auth subdomain (if DEPLOY_KEYCLOAK=true) +# ----------------------------------------------------------------------------- +if [[ -n "$AUTH_HOSTNAME" ]]; then + log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." + + # Create bootstrap Ingress for auth subdomain (to trigger HTTP-01 challenge) + kubectl apply -f - </dev/null || echo "") + if [[ "$AUTH_CERT_READY" == "True" ]]; then + log_success "Auth TLS certificate issued and ready" + break + fi + sleep 5 + done + + if [[ "$AUTH_CERT_READY" != "True" ]]; then + log_warning "Auth certificate not ready yet. It may take a few more minutes." + log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" + fi + + # Clean up the bootstrap Ingress if Keycloak will create its own + if [[ "$OSMO_DEPLOYED" == "true" ]]; then + kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null + fi +fi + # ----------------------------------------------------------------------------- # Step 5: Update OSMO service_base_url to HTTPS (only if OSMO is deployed) # ----------------------------------------------------------------------------- @@ -304,10 +422,17 @@ if [[ "$OSMO_DEPLOYED" == "true" ]]; then echo " osmo login https://${HOSTNAME} --method dev --username admin" else echo "TLS certificate prepared for: ${HOSTNAME}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo "Auth TLS certificate prepared for: ${AUTH_HOSTNAME}" + fi echo "" echo "Next steps:" - echo " 1. Wait for certificate to be ready: kubectl get certificate -n ${OSMO_NS}" + echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}" echo " 2. 
Deploy OSMO: ./05-deploy-osmo-control-plane.sh" echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " 3. Deploy with Keycloak: DEPLOY_KEYCLOAK=true ./05-deploy-osmo-control-plane.sh" + echo " (Keycloak will be exposed at https://${AUTH_HOSTNAME})" + fi fi echo "" diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh index 9ef8d5d7b..147967a69 100755 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh @@ -459,95 +459,86 @@ REDIS_HOST="redis-master.${OSMO_NAMESPACE}.svc.cluster.local" # Required for: osmo login, osmo token, backend operator # Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak +# Auto-detect TLS certificate early (needed for KC_EXTERNAL decision below) +TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +TLS_ENABLED="false" +if kubectl get secret "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then + log_info "TLS certificate detected (${TLS_SECRET})" + TLS_ENABLED="true" +elif kubectl get certificate "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then + log_info "TLS certificate pending (${TLS_SECRET})" + TLS_ENABLED="true" +fi + # Keycloak service URL (same namespace as OSMO) KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" -AUTH_DOMAIN="auth-${OSMO_DOMAIN}" + +# Derive Keycloak external hostname +# Priority: KEYCLOAK_HOSTNAME env var > auto-derive from OSMO_INGRESS_HOSTNAME > OSMO_DOMAIN +if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + AUTH_DOMAIN="${KEYCLOAK_HOSTNAME}" +elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + AUTH_DOMAIN="auth.${OSMO_INGRESS_HOSTNAME}" +else + AUTH_DOMAIN="auth.${OSMO_DOMAIN}" +fi +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" if [[ 
"${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then log_info "Deploying Keycloak for OSMO authentication..." log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" - # ------------------------------------------------------------------------- - # Step 1: Create Keycloak database in PostgreSQL - # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-1-configure-postgresql - # ------------------------------------------------------------------------- - log_info "Creating Keycloak database in PostgreSQL..." - - # Delete old pod if exists - kubectl delete pod osmo-db-ops -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - - # Use the managed PostgreSQL credentials (bootstrap user has CREATEDB privilege) - cat > /tmp/keycloak-db-init.yaml </dev/null || true - sleep 5 - kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/osmo-db-ops -n "${OSMO_NAMESPACE}" --timeout=60s || { - log_warning "Database creation pod status:" - kubectl logs -n "${OSMO_NAMESPACE}" osmo-db-ops || true - } - kubectl delete pod osmo-db-ops -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - rm -f /tmp/keycloak-db-init.yaml - log_success "Keycloak database ready" + # Keycloak database was already created in Step 2 (osmo-db-init pod) when DEPLOY_KEYCLOAK=true # ------------------------------------------------------------------------- - # Step 2: Create secrets for Keycloak + # Step 4a: Create secrets for Keycloak # ------------------------------------------------------------------------- log_info "Creating Keycloak secrets..." 
- + # Save admin password to secret for future re-runs kubectl create secret generic keycloak-admin-secret \ --namespace "${OSMO_NAMESPACE}" \ --from-literal=password="${KEYCLOAK_ADMIN_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - - + # Create keycloak-db-secret for external database (per OSMO docs) - # Uses the managed PostgreSQL credentials kubectl create secret generic keycloak-db-secret \ --namespace "${OSMO_NAMESPACE}" \ --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ --dry-run=client -o yaml | kubectl apply -f - - + log_success "Keycloak secrets created" # ------------------------------------------------------------------------- - # Step 3: Install Keycloak using Bitnami Helm chart - # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#install-keycloak-using-bitnami-helm-chart + # Step 4b: Determine if Keycloak should use external TLS ingress + # ------------------------------------------------------------------------- + KC_EXTERNAL="false" + if [[ "$TLS_ENABLED" == "true" && -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + # Check TLS secret for auth domain exists + if kubectl get secret "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null || \ + kubectl get secret "${KC_TLS_SECRET}" -n "${INGRESS_NAMESPACE:-ingress-nginx}" &>/dev/null; then + KC_EXTERNAL="true" + log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" + elif kubectl get certificate "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then + KC_EXTERNAL="true" + log_info "Keycloak TLS certificate pending — will create external ingress" + else + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." 
+ log_warning "Run: DEPLOY_KEYCLOAK=true ./04-enable-tls.sh ${OSMO_INGRESS_HOSTNAME}" + log_warning "Keycloak will be internal-only (port-forward access)" + fi + fi + + # ------------------------------------------------------------------------- + # Step 4c: Install Keycloak using Bitnami Helm chart # ------------------------------------------------------------------------- log_info "Installing Keycloak using Bitnami Helm chart..." - + # Add Bitnami repo helm repo add bitnami https://charts.bitnami.com/bitnami --force-update 2>/dev/null || true helm repo update bitnami - + # Create keycloak-values.yaml per OSMO documentation cat > /tmp/keycloak-values.yaml </dev/null | grep -q keycloak; then @@ -655,26 +713,112 @@ EOF echo " Waiting for Keycloak pod to be created... ($i/30)" sleep 5 done - + # Now wait for it to be ready kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ -n "${OSMO_NAMESPACE}" --timeout=300s || { log_warning "Keycloak pod not ready yet, checking logs..." kubectl logs -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --tail=30 || true } - + # Additional wait for Keycloak to fully initialize log_info "Waiting for Keycloak to fully initialize..." sleep 30 - - # Configure Keycloak realm and clients for OSMO - # Per documentation: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#post-installation-keycloak-configuration - log_info "Configuring Keycloak realm and clients for OSMO..." - - # Generate client secret + + # ------------------------------------------------------------------------- + # Step 4c.1: Verify admin password works (handle stale DB) + # ------------------------------------------------------------------------- + # KC_BOOTSTRAP_ADMIN_* only creates the admin user on FIRST database init. + # If the keycloak DB already existed (e.g. from a prior deployment with a + # different password), the bootstrap is a no-op and the stored password + # won't match. 
We detect this and reset the password via SQL. + log_info "Verifying Keycloak admin password..." + + KC_POD=$(kubectl get pods -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [[ -n "$KC_POD" ]]; then + KC_TOKEN_RESP=$(kubectl exec -n "${OSMO_NAMESPACE}" "${KC_POD}" -- \ + curl -s -X POST http://localhost:8080/realms/master/protocol/openid-connect/token \ + -d "client_id=admin-cli" \ + -d "username=admin" \ + -d "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + -d "grant_type=password" 2>/dev/null || echo "") + + if echo "$KC_TOKEN_RESP" | grep -q "access_token"; then + log_success "Keycloak admin password verified" + else + log_warning "Admin password mismatch (stale keycloak DB). Resetting via SQL..." + # Use the db-init credentials to reset the admin password in the keycloak DB + # Keycloak 26.x stores bcrypt hashes. We use the Keycloak KC_SPI approach instead: + # Drop and recreate the keycloak database, then restart Keycloak so bootstrap runs fresh. + kubectl delete pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + kubectl run osmo-kc-db-reset \ + --namespace "${OSMO_NAMESPACE}" \ + --image=postgres:16-alpine \ + --restart=Never \ + --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ + --env="PGHOST=${POSTGRES_HOST}" \ + --env="PGPORT=${POSTGRES_PORT}" \ + --env="PGUSER=${POSTGRES_USER}" \ + --env="PGDATABASE=${POSTGRES_DB}" \ + --command -- sh -c ' + echo "Dropping keycloak database..." + psql -c "DROP DATABASE IF EXISTS keycloak;" + echo "Recreating keycloak database..." 
+ psql -c "CREATE DATABASE keycloak;" + echo "Done" + ' >/dev/null 2>&1 + + # Wait for reset pod + for i in $(seq 1 30); do + _rst_status=$(kubectl get pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + [[ "$_rst_status" == "Succeeded" || "$_rst_status" == "Failed" ]] && break + sleep 2 + done + kubectl logs osmo-kc-db-reset -n "${OSMO_NAMESPACE}" 2>/dev/null || true + kubectl delete pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + + if [[ "$_rst_status" == "Succeeded" ]]; then + log_info "Keycloak DB reset. Restarting Keycloak pod for fresh bootstrap..." + kubectl delete pod -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --wait=false + sleep 10 + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ + -n "${OSMO_NAMESPACE}" --timeout=300s || log_warning "Keycloak pod not ready after restart" + log_info "Waiting for Keycloak to fully initialize after restart..." + sleep 20 + log_success "Keycloak restarted with fresh DB (admin password will match)" + else + log_error "Failed to reset keycloak DB. Admin password may not work." + log_error "Manually reset: psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB} -c 'DROP DATABASE keycloak; CREATE DATABASE keycloak;'" + fi + fi + fi + + # ------------------------------------------------------------------------- + # Step 4d: Import OSMO realm using official sample_osmo_realm.json + # ------------------------------------------------------------------------- + log_info "Configuring Keycloak realm using official OSMO realm JSON..." 
+ + # Generate client secret for osmo-browser-flow (confidential client) OIDC_CLIENT_SECRET=$(openssl rand -hex 16) - - # Create a job to configure Keycloak + + # Determine OSMO base URL for client redirect URIs + if [[ "$KC_EXTERNAL" == "true" ]]; then + OSMO_BASE_URL="https://${OSMO_INGRESS_HOSTNAME}" + else + OSMO_BASE_URL="http://localhost:8080" + fi + + # Upload the official realm JSON as a ConfigMap (so the job can mount it) + log_info "Creating ConfigMap from sample_osmo_realm.json..." + kubectl create configmap keycloak-realm-json \ + --namespace "${OSMO_NAMESPACE}" \ + --from-file=realm.json="${SCRIPT_DIR}/sample_osmo_realm.json" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create a job to import the realm and configure a test user cat > /tmp/keycloak-config-job.yaml < /dev/null 2>&1; then + + echo "============================================" + echo " OSMO Keycloak Realm Import" + echo "============================================" + echo "" + + # -- Step 1: Prepare realm JSON -- + echo "=== Step 1: Prepare realm JSON ===" + cp /data/realm.json /tmp/realm-import.json + + # Replace placeholder URLs with actual OSMO URL + sed -i "s|https://default.com|${OSMO_BASE_URL}|g" /tmp/realm-import.json + + # Replace masked client secret with generated secret + sed -i 's/"secret": "[*][*]*"/"secret": "${OIDC_CLIENT_SECRET}"/' /tmp/realm-import.json + + echo " OSMO URL: ${OSMO_BASE_URL}" + echo " Realm JSON: \$(wc -c < /tmp/realm-import.json) bytes" + echo "" + + # -- Step 2: Wait for Keycloak -- + echo "=== Step 2: Wait for Keycloak ===" + # NOTE: Keycloak 26.x serves /health/ready on the management port (9000), + # NOT on the main HTTP port (8080). The K8s service exposes port 80->8080, + # so /health/ready returns 404. Use /realms/master as readiness check instead. 
+ for i in 1 2 3 4 5 6 7 8 9 10 11 12; do + if curl -s -f "\${KEYCLOAK_URL}/realms/master" > /dev/null 2>&1; then echo "Keycloak is ready" break fi - echo "Attempt \$i: Keycloak not ready yet..." + echo " Attempt \$i: Keycloak not ready yet..." sleep 15 done - - echo "Getting admin token..." + echo "" + + # -- Step 3: Get admin token -- + echo "=== Step 3: Get admin token ===" for i in 1 2 3 4 5; do TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ --data-urlencode "client_id=admin-cli" \ @@ -715,66 +892,73 @@ spec: --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) if [ -n "\$TOKEN" ]; then break; fi - echo "Retry \$i: waiting for token..." + echo " Retry \$i: waiting for token..." sleep 10 done - + if [ -z "\$TOKEN" ]; then - echo "Failed to get admin token" + echo "FATAL: Failed to get admin token" exit 1 fi echo "Got admin token" - - # Create osmo realm (per documentation) - echo "Creating osmo realm..." - curl -s -X POST "\${KEYCLOAK_URL}/admin/realms" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"realm":"osmo","enabled":true,"registrationAllowed":false}' || echo "Realm may already exist" - - # Create osmo-device client (for CLI device code flow) - # Per documentation: public client with device authorization grant - echo "Creating osmo-device client..." 
- curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/clients" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "clientId": "osmo-device", - "name": "OSMO Device Client", - "enabled": true, - "publicClient": true, - "directAccessGrantsEnabled": true, - "standardFlowEnabled": false, - "implicitFlowEnabled": false, - "protocol": "openid-connect", - "attributes": { - "oauth2.device.authorization.grant.enabled": "true" - } - }' || echo "Client may already exist" - - # Create osmo-browser-flow client (for web UI) - # Per documentation: confidential client with standard flow - echo "Creating osmo-browser-flow client..." - curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/clients" \ + echo "" + + # -- Step 4: Import OSMO realm -- + echo "=== Step 4: Import OSMO realm ===" + + # Delete existing realm if present (idempotent re-runs) + REALM_STATUS=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$REALM_STATUS" = "200" ]; then + echo " Existing 'osmo' realm found - deleting for fresh import..." + curl -s -X DELETE "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN" + echo " Old realm deleted" + sleep 5 + fi + + echo "Importing official OSMO realm from sample_osmo_realm.json..." 
+ IMPORT_HTTP=\$(curl -s -o /tmp/import-resp.txt -w "%{http_code}" \ + -X POST "\${KEYCLOAK_URL}/admin/realms" \ -H "Authorization: Bearer \$TOKEN" \ -H "Content-Type: application/json" \ - -d '{ - "clientId": "osmo-browser-flow", - "name": "OSMO Browser Flow Client", - "enabled": true, - "publicClient": false, - "secret": "${OIDC_CLIENT_SECRET}", - "directAccessGrantsEnabled": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "serviceAccountsEnabled": true, - "protocol": "openid-connect", - "redirectUris": ["*"], - "webOrigins": ["*"] - }' || echo "Client may already exist" - - # Create a test user (per documentation) - echo "Creating osmo-admin user..." + -d @/tmp/realm-import.json) + + if [ "\$IMPORT_HTTP" = "201" ] || [ "\$IMPORT_HTTP" = "204" ]; then + echo "Realm imported successfully (HTTP \$IMPORT_HTTP)" + else + echo "WARNING: Realm import returned HTTP \$IMPORT_HTTP" + cat /tmp/import-resp.txt 2>/dev/null || true + echo "" + echo "Trying partial import as fallback..." 
+ curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/partialImport" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/realm-import.json || echo "Partial import also failed" + fi + + # Verify realm exists + sleep 3 + VERIFY=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$VERIFY" != "200" ]; then + echo "FATAL: Realm 'osmo' not found after import (HTTP \$VERIFY)" + exit 1 + fi + echo "Realm 'osmo' verified" + echo "" + + # -- Step 5: Create test user -- + echo "=== Step 5: Create test user ===" + + # Refresh admin token + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + echo "Creating osmo-admin test user..." 
curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/users" \ -H "Authorization: Bearer \$TOKEN" \ -H "Content-Type: application/json" \ @@ -787,61 +971,113 @@ spec: "email": "osmo-admin@example.com", "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] }' || echo "User may already exist" - echo "" + + # -- Step 6: Assign user to Admin group -- + echo "=== Step 6: Assign user to Admin group ===" + + USER_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/users?username=osmo-admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$USER_ID" ]; then + echo " User ID: \$USER_ID" + + ADMIN_GROUP_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/groups?search=Admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$ADMIN_GROUP_ID" ]; then + echo " Admin Group ID: \$ADMIN_GROUP_ID" + curl -s -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/users/\${USER_ID}/groups/\${ADMIN_GROUP_ID}" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{}' || echo "Failed to assign group" + echo " User 'osmo-admin' assigned to Admin group (osmo-admin + osmo-user roles)" + else + echo " WARNING: Admin group not found - user roles may need manual assignment" + fi + else + echo " WARNING: Could not find osmo-admin user ID" + fi + echo "" + + # -- Done -- echo "=========================================" - echo "Keycloak OSMO configuration complete!" 
+ echo " Keycloak OSMO Configuration Complete" echo "=========================================" - echo "Realm: osmo" - echo "Clients: osmo-device, osmo-browser-flow" - echo "Test user: osmo-admin / osmo-admin" + echo "" + echo "Realm: osmo (imported from official sample_osmo_realm.json)" + echo "Clients: osmo-device (public, device code + direct access)" + echo " osmo-browser-flow (confidential, authorization code)" + echo "Groups: Admin, User, Backend Operator" + echo "Roles: osmo-admin, osmo-user, osmo-backend, grafana-*, dashboard-*" + echo "Mappers: JWT 'roles' claim configured on both clients" + echo "Test user: osmo-admin / osmo-admin (Admin group)" echo "" EOF # Delete any previous config job kubectl delete job keycloak-osmo-setup -n "${KEYCLOAK_NAMESPACE}" --ignore-not-found 2>/dev/null || true - + kubectl apply -f /tmp/keycloak-config-job.yaml - - log_info "Waiting for Keycloak configuration job..." + + log_info "Waiting for Keycloak realm import job..." kubectl wait --for=condition=complete job/keycloak-osmo-setup \ -n "${KEYCLOAK_NAMESPACE}" --timeout=300s || { log_warning "Keycloak configuration may have failed, check logs:" kubectl logs -n "${KEYCLOAK_NAMESPACE}" -l job-name=keycloak-osmo-setup --tail=50 || true } - + # Store the client secret for OIDC (used by Envoy sidecar) kubectl create secret generic oidc-secrets \ --namespace "${OSMO_NAMESPACE}" \ --from-literal=client_secret="${OIDC_CLIENT_SECRET}" \ --from-literal=hmac_secret="$(openssl rand -base64 32)" \ --dry-run=client -o yaml | kubectl apply -f - - - rm -f /tmp/keycloak-values.yaml /tmp/keycloak-config-job.yaml - + + # Clean up temporary files and ConfigMap + rm -f /tmp/keycloak-config-job.yaml + kubectl delete configmap keycloak-realm-json -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + log_success "Keycloak deployed and configured" echo "" - echo "Keycloak Access:" - echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" - echo " URL: 
http://localhost:8081" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo " Test User: osmo-admin / osmo-admin" - echo "" - echo "OSMO Auth Endpoints (in-cluster):" - echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" - echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" - echo "" - - # Keycloak is deployed but we disable OSMO's internal auth - # because OSMO's JWT validation expects its own keys, not Keycloak's - # Users can still get tokens from Keycloak for future use - AUTH_ENABLED="false" - log_info "Note: OSMO internal auth disabled (use Keycloak tokens with API directly)" + if [[ "$KC_EXTERNAL" == "true" ]]; then + echo "Keycloak Access (external):" + echo " URL: https://${AUTH_DOMAIN}" + echo " Admin console: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints:" + echo " Token: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/token" + echo " Auth: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Enable OSMO auth with Envoy sidecars (production mode) + AUTH_ENABLED="true" + KEYCLOAK_EXTERNAL_URL="https://${AUTH_DOMAIN}" + log_success "OSMO authentication will be ENABLED with Envoy sidecars" + else + echo "Keycloak Access (port-forward only):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints (in-cluster):" + echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" + echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Auth disabled when Keycloak is internal-only + AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" + log_info "Note: OSMO auth disabled (Keycloak is internal-only, no TLS ingress)" + log_info "To enable 
auth, run: DEPLOY_KEYCLOAK=true ./04-enable-tls.sh ${OSMO_INGRESS_HOSTNAME:-}" + fi else log_info "Skipping Keycloak (set DEPLOY_KEYCLOAK=true to enable)" log_warning "Without Keycloak, 'osmo login' and token creation will not work" log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" fi # ----------------------------------------------------------------------------- @@ -859,15 +1095,9 @@ else log_info "Ingress hostname: (any — IP-based access)" fi -# Auto-detect TLS certificate (created by 04-enable-tls.sh run before this script) -TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" -TLS_ENABLED="false" -if kubectl get secret "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then +# TLS_SECRET and TLS_ENABLED were already set earlier (before Keycloak section) +if [[ "$TLS_ENABLED" == "true" ]]; then log_success "TLS certificate detected (${TLS_SECRET}) — will create HTTPS Ingress" - TLS_ENABLED="true" -elif kubectl get certificate "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then - log_info "TLS certificate pending (${TLS_SECRET}) — will create HTTPS Ingress" - TLS_ENABLED="true" fi # Create the values file with proper extraEnv and extraVolumes for each service @@ -914,11 +1144,24 @@ services: nginx.ingress.kubernetes.io/proxy-busy-buffers-size: "32k" nginx.ingress.kubernetes.io/large-client-header-buffers: "4 16k" # Authentication configuration - # NOTE: Auth is DISABLED because OSMO's internal JWT validation expects - # tokens signed with its own keys, not Keycloak's keys. - # For testing, the API is open. For production, use network-level security. 
+$(if [[ "$AUTH_ENABLED" == "true" ]]; then +cat < Keycloak) + oauth2Filter: + enabled: true + tokenEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token + authEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth + clientId: osmo-browser-flow + authProvider: ${AUTH_DOMAIN} + secretName: oidc-secrets + clientSecretKey: client_secret + hmacSecretKey: hmac_secret + + # JWT Filter config -- three providers + jwt: + user_header: x-osmo-user + providers: + # Provider 1: Keycloak device flow (CLI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-device + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 2: Keycloak browser flow (Web UI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-browser-flow + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 3: OSMO-signed JWTs (service accounts) + - issuer: osmo + audience: osmo + jwks_uri: http://localhost:8000/api/auth/keys + user_claim: unique_name + cluster: service +ENVOY_ENABLED +else +cat < /tmp/osmo_router_values.yaml < /tmp/osmo_ui_values.yaml </dev/null; then - CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") - - if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then - log_info " Patching $svc: targetPort envoy-http -> 8000" - kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ - -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ - log_warning " Failed to patch $svc" - else - log_info " $svc: targetPort = $CURRENT_TARGET (OK)" + for svc in $OSMO_SERVICES; do + if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then + CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ + -o 
jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") + + if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then + log_info " Patching $svc: targetPort envoy-http -> 8000" + kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ + -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ + log_warning " Failed to patch $svc" + else + log_info " $svc: targetPort = $CURRENT_TARGET (OK)" + fi fi - fi -done + done -log_success "Service ports verified" + log_success "Service ports verified" +fi # ----------------------------------------------------------------------------- # Step 11: Verify Deployment @@ -1363,10 +1788,10 @@ else fi if [[ -n "$TARGET_SERVICE_URL" ]]; then - # Start port-forward to access the OSMO API - log_info "Starting port-forward to configure service_base_url..." - kubectl port-forward -n "${OSMO_NAMESPACE}" svc/osmo-service 8080:80 &>/dev/null & - _PF_PID=$! + # Start port-forward using the shared helper (auto-detects Envoy) + start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + _PF_PID=$PORT_FORWARD_PID + export _OSMO_PORT=8080 _cleanup_pf() { if [[ -n "${_PF_PID:-}" ]]; then @@ -1386,39 +1811,37 @@ if [[ -n "$TARGET_SERVICE_URL" ]]; then done if [[ "$_pf_ready" == "true" ]]; then - # Login - if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then - # Check current value - CURRENT_SVC_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + # Login (no-op when bypassing Envoy -- osmo_curl handles auth headers) + osmo_login 8080 || true - if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then - log_success "service_base_url already configured: ${CURRENT_SVC_URL}" - else - if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then - log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" - fi + # Check current value + CURRENT_SVC_URL=$(osmo_curl GET 
"http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - # Write config - cat > /tmp/service_url_fix.json << SVCEOF + if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url already configured: ${CURRENT_SVC_URL}" + else + if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then + log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" + fi + + # Write config using the PATCH API helper + cat > /tmp/service_url_fix.json << SVCEOF { "service_base_url": "${TARGET_SERVICE_URL}" } SVCEOF - if osmo config update SERVICE --file /tmp/service_url_fix.json --description "Set service_base_url for osmo-ctrl sidecar" 2>/dev/null; then - # Verify - NEW_SVC_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then - log_success "service_base_url configured: ${NEW_SVC_URL}" - else - log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." - fi + if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then + # Verify + NEW_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url configured: ${NEW_SVC_URL}" else - log_warning "Failed to set service_base_url. Run ./08-configure-service-url.sh manually." + log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." fi - rm -f /tmp/service_url_fix.json + else + log_warning "Failed to set service_base_url. Run ./08-configure-service-url.sh manually." fi - else - log_warning "Could not login to OSMO. Run ./08-configure-service-url.sh manually." + rm -f /tmp/service_url_fix.json fi else log_warning "Port-forward not ready. 
Run ./08-configure-service-url.sh manually." @@ -1433,48 +1856,77 @@ log_success "OSMO Control Plane deployment complete!" echo "========================================" echo "" -if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then - echo "OSMO Access (HTTPS via NGINX Ingress + cert-manager):" - echo " OSMO API: https://${INGRESS_HOSTNAME}/api/version" - echo " OSMO UI: https://${INGRESS_HOSTNAME}" - echo " OSMO CLI: osmo login https://${INGRESS_HOSTNAME} --method dev --username admin" +if [[ "$AUTH_ENABLED" == "true" ]]; then + # --- Auth-enabled output --- + echo "Authentication: ENABLED (Keycloak + Envoy sidecars)" echo "" -elif [[ -n "$INGRESS_URL" ]]; then - echo "OSMO Access (via NGINX Ingress LoadBalancer):" - echo " OSMO API: ${INGRESS_URL}/api/version" - echo " OSMO UI: ${INGRESS_URL}" - echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" + echo "Keycloak Admin Console:" + echo " URL: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" echo "" -else - log_warning "Could not detect Ingress LoadBalancer IP." 
- echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" + echo "OSMO Access:" + if [[ -n "$INGRESS_URL" ]]; then + echo " OSMO API: ${INGRESS_URL}/api/version (unauthenticated -- skipAuthPath)" + echo " OSMO Web UI: ${INGRESS_URL} (redirects to Keycloak login)" + fi echo "" - echo " Fallback (port-forward):" - echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" - echo " URL: http://localhost:8080" + echo "Login methods:" + echo " Browser: Visit ${INGRESS_URL:-https://} -- you will be redirected to Keycloak" + echo " CLI: osmo login ${INGRESS_URL:-https://}" + echo " (Opens browser for device authorization flow)" + echo "" + echo "Test user: osmo-admin / osmo-admin" + echo "" + echo "Keycloak realm management (groups, roles, users):" + echo " https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/keycloak_setup.html" echo "" -fi - -echo "NOTE: OSMO API authentication is DISABLED for testing." -echo " The API is accessible without tokens." 
-echo "" -echo "Test the API:" -if [[ -n "$INGRESS_URL" ]]; then - echo " curl ${INGRESS_URL}/api/version" - echo " curl ${INGRESS_URL}/api/workflow" else - echo " curl http://localhost:8080/api/version" - echo " curl http://localhost:8080/api/workflow" -fi -echo "" -if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - echo "Keycloak Access (for future use):" - echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" - echo " URL: http://localhost:8081" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo " Test User: osmo-admin / osmo-admin" + # --- No-auth output --- + if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then + echo "OSMO Access (HTTPS via NGINX Ingress + cert-manager):" + echo " OSMO API: https://${INGRESS_HOSTNAME}/api/version" + echo " OSMO UI: https://${INGRESS_HOSTNAME}" + echo " OSMO CLI: osmo login https://${INGRESS_HOSTNAME} --method dev --username admin" + echo "" + elif [[ -n "$INGRESS_URL" ]]; then + echo "OSMO Access (via NGINX Ingress LoadBalancer):" + echo " OSMO API: ${INGRESS_URL}/api/version" + echo " OSMO UI: ${INGRESS_URL}" + echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" + echo "" + else + log_warning "Could not detect Ingress LoadBalancer IP." + echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" + echo "" + echo " Fallback (port-forward):" + echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" + echo " URL: http://localhost:8080" + echo "" + fi + + echo "NOTE: OSMO API authentication is DISABLED." + echo " The API is accessible without tokens." + echo " Set DEPLOY_KEYCLOAK=true with TLS to enable Keycloak + Envoy auth." 
+ echo "" + echo "Test the API:" + if [[ -n "$INGRESS_URL" ]]; then + echo " curl ${INGRESS_URL}/api/version" + echo " curl ${INGRESS_URL}/api/workflow" + else + echo " curl http://localhost:8080/api/version" + echo " curl http://localhost:8080/api/workflow" + fi echo "" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + echo "Keycloak Access (internal only, auth not enforced):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + fi fi + echo "Ingress resources:" kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true echo "" diff --git a/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh index 0b5b495d7..a1d530c61 100755 --- a/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh @@ -93,18 +93,17 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then # If still no token, automatically create one using port-forward if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then log_info "No token found - automatically creating service token..." - + # Check if osmo CLI is available if ! command -v osmo &>/dev/null; then log_error "osmo CLI not found. Please install it first." exit 1 fi - - # Start port-forward in background - log_info "Starting port-forward to OSMO service..." - kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & - PORT_FORWARD_PID=$! 
- + + # Start port-forward using shared helper (auto-detects Envoy) + start_osmo_port_forward osmo 8080 + export _OSMO_PORT=8080 + # Cleanup function to kill port-forward on exit cleanup_port_forward() { if [[ -n "${PORT_FORWARD_PID:-}" ]]; then @@ -113,7 +112,7 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then fi } trap cleanup_port_forward EXIT - + # Wait for port-forward to be ready log_info "Waiting for port-forward to be ready..." max_wait=30 @@ -127,36 +126,69 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then fi done log_success "Port-forward ready" - - # Login with dev method (since auth is disabled) - log_info "Logging in to OSMO (dev method)..." - if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then - log_error "Failed to login to OSMO" - exit 1 + + # Detect if Keycloak auth is active + KEYCLOAK_ENABLED="false" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]] || kubectl get svc keycloak -n osmo &>/dev/null; then + if has_envoy_sidecar osmo "app.kubernetes.io/name=service"; then + KEYCLOAK_ENABLED="true" + fi fi - log_success "Logged in successfully" - - # Create service token - TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" - EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") - - log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." 
- TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ - --expires-at "$EXPIRY_DATE" \ - --description "Backend Operator Token (auto-generated)" \ - --service --roles osmo-backend 2>&1) - - # Extract token from output (format: "Access token: ") - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) - if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then - log_error "Failed to create service token" - echo "Output: $TOKEN_OUTPUT" - exit 1 + if [[ "$KEYCLOAK_ENABLED" == "true" ]]; then + # Keycloak + Envoy mode: use the PATCH API via pod port-forward + log_info "Keycloak auth detected — creating service token via API..." + + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + TOKEN_RESPONSE=$(osmo_curl POST "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}" \ + -d "{\"description\":\"Backend Operator Token\",\"expires_at\":\"${EXPIRY_DATE}\",\"roles\":[\"osmo-backend\"]}") + + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '.token // empty' 2>/dev/null || echo "") + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + # Fallback: try extracting from different response format + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '.access_token // empty' 2>/dev/null || echo "") + fi + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_error "Failed to create service token via API" + echo "Response: $TOKEN_RESPONSE" + exit 1 + fi + log_success "Service token created via API: $TOKEN_NAME" + else + # No Keycloak: use osmo CLI with dev login + log_info "Logging in to OSMO (dev method)..." + if ! 
osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + exit 1 + fi + log_success "Logged in successfully" + + # Create service token + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." + TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token (auto-generated)" \ + --service --roles osmo-backend 2>&1) + + # Extract token from output (format: "Access token: ") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_error "Failed to create service token" + echo "Output: $TOKEN_OUTPUT" + exit 1 + fi + + log_success "Service token created successfully" fi - - log_success "Service token created successfully" - + # Stop port-forward (we're done with it) cleanup_port_forward trap - EXIT diff --git a/applications/osmo/deploy/002-setup/07-configure-storage.sh b/applications/osmo/deploy/002-setup/07-configure-storage.sh index 61b31b123..503c391eb 100755 --- a/applications/osmo/deploy/002-setup/07-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/07-configure-storage.sh @@ -90,9 +90,9 @@ fi # ----------------------------------------------------------------------------- log_info "Starting port-forward to OSMO service..." -# Start port-forward in background -kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & -PORT_FORWARD_PID=$! +# Start port-forward using shared helper (auto-detects Envoy) +start_osmo_port_forward osmo 8080 +export _OSMO_PORT=8080 # Cleanup function cleanup_port_forward() { @@ -117,9 +117,9 @@ while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/versio done log_success "Port-forward ready" -# Login with dev method +# Login (no-op when bypassing Envoy) log_info "Logging in to OSMO..." -if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then +if ! osmo_login 8080; then log_error "Failed to login to OSMO" exit 1 fi @@ -170,11 +170,10 @@ WORKFLOW_LOG_CONFIG=$(cat < /tmp/workflow_log_config.json -# Use EDITOR='tee' trick to bypass interactive editor -if osmo config update WORKFLOW --file /tmp/workflow_log_config.json --description "Configure workflow log storage" 2>/dev/null; then +if osmo_config_update WORKFLOW /tmp/workflow_log_config.json "Configure workflow log storage"; then log_success "Workflow log storage configured" else log_error "Failed to configure workflow log storage" @@ -202,11 +201,10 @@ WORKFLOW_DATA_CONFIG=$(cat < /tmp/workflow_data_config.json -# Use EDITOR='tee' trick to bypass interactive editor -if osmo config update WORKFLOW --file /tmp/workflow_data_config.json --description "Configure workflow data storage" 2>/dev/null; then +if osmo_config_update WORKFLOW /tmp/workflow_data_config.json "Configure workflow data storage"; then log_success "Workflow data storage configured" else log_error "Failed to configure workflow data storage" @@ -225,7 +223,7 @@ log_info "Verifying storage configuration..." echo "" echo "Workflow configuration:" osmo config show WORKFLOW 2>/dev/null || \ - curl -s "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' || \ + osmo_curl GET "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' 
|| \ log_warning "Could not retrieve workflow config for verification" # Cleanup diff --git a/applications/osmo/deploy/002-setup/08-configure-service-url.sh b/applications/osmo/deploy/002-setup/08-configure-service-url.sh index 2ec685d04..f240e4e52 100755 --- a/applications/osmo/deploy/002-setup/08-configure-service-url.sh +++ b/applications/osmo/deploy/002-setup/08-configure-service-url.sh @@ -24,8 +24,9 @@ check_kubectl || exit 1 # ----------------------------------------------------------------------------- log_info "Starting port-forward to OSMO service..." -kubectl port-forward -n osmo svc/osmo-service 8080:80 &>/dev/null & -PORT_FORWARD_PID=$! +# Start port-forward using shared helper (auto-detects Envoy) +start_osmo_port_forward osmo 8080 +export _OSMO_PORT=8080 cleanup_port_forward() { if [[ -n "${PORT_FORWARD_PID:-}" ]]; then @@ -49,9 +50,9 @@ while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/versio done log_success "Port-forward ready" -# Login +# Login (no-op when bypassing Envoy) log_info "Logging in to OSMO..." -if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then +if ! osmo_login 8080; then log_error "Failed to login to OSMO" exit 1 fi @@ -83,7 +84,7 @@ fi # ----------------------------------------------------------------------------- log_info "Checking current service_base_url..." 
-CURRENT_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') +CURRENT_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') echo "Current service_base_url: '${CURRENT_URL}'" if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" && "$CURRENT_URL" == "$SERVICE_URL" ]]; then @@ -107,7 +108,7 @@ cat > /tmp/service_url_fix.json << EOF } EOF -if osmo config update SERVICE --file /tmp/service_url_fix.json --description "Set service_base_url for osmo-ctrl sidecar" 2>/dev/null; then +if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then log_success "service_base_url configured" else log_error "Failed to configure service_base_url" @@ -122,7 +123,7 @@ rm -f /tmp/service_url_fix.json # ----------------------------------------------------------------------------- log_info "Verifying configuration..." -NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') +NEW_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') if [[ "$NEW_URL" == "$SERVICE_URL" ]]; then log_success "service_base_url verified: ${NEW_URL}" diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh new file mode 100755 index 000000000..caeaa8b74 --- /dev/null +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Uninstall Keycloak and disable OSMO authentication +# This removes Keycloak and related secrets. After running this, re-deploy +# OSMO control plane without DEPLOY_KEYCLOAK to switch back to open API mode. +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" + +echo "" +echo "========================================" +echo " Uninstall Keycloak" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# Step 1: Uninstall Keycloak Helm release +log_info "Uninstalling Keycloak Helm release..." +helm uninstall keycloak --namespace "${OSMO_NS}" 2>/dev/null || log_info "Keycloak Helm release not found (already removed)" + +# Step 2: Delete Keycloak config job and realm ConfigMap +log_info "Cleaning up Keycloak configuration job and ConfigMap..." +kubectl delete job keycloak-osmo-setup -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete configmap keycloak-realm-json -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + +# Step 3: Delete Keycloak-related secrets +log_info "Deleting Keycloak secrets..." +kubectl delete secret keycloak-admin-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret keycloak-db-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret oidc-secrets -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +log_success "Keycloak secrets deleted" + +# Step 4: Delete Keycloak TLS secret +log_info "Deleting Keycloak TLS secret (${KC_TLS_SECRET})..." +kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret "${KC_TLS_SECRET}" -n "${INGRESS_NS}" --ignore-not-found 2>/dev/null || true +log_success "Keycloak TLS secrets deleted" + +# Step 5: Delete Keycloak PVCs (if any) +log_info "Cleaning up Keycloak PVCs..." 
+kubectl delete pvc -l app.kubernetes.io/name=keycloak -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + +echo "" +log_success "Keycloak uninstalled" +echo "" +echo "Next steps:" +echo " 1. Re-deploy OSMO control plane without authentication:" +echo " unset DEPLOY_KEYCLOAK" +echo " ./05-deploy-osmo-control-plane.sh" +echo "" +echo " 2. (Optional) Drop the Keycloak database from PostgreSQL:" +echo " Connect to your Managed PostgreSQL and run:" +echo " DROP DATABASE IF EXISTS keycloak;" +echo "" +echo " 3. (Optional) Remove the DNS A record for the auth subdomain" +echo "" diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index cbb43bdcc..7938b3d4b 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -43,6 +43,15 @@ export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" # Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty. export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}" +# Keycloak / Authentication +# Set DEPLOY_KEYCLOAK=true to deploy Keycloak and enable OSMO auth with Envoy sidecars. +export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" +# Keycloak hostname (e.g. auth.osmo.example.com). +# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth.. +export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}" +# TLS secret name for the Keycloak ingress (separate from the main osmo-tls). 
+export KEYCLOAK_TLS_SECRET_NAME="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" + # Paths export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" export VALUES_DIR="${SCRIPT_DIR}/values" diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index c2e4a1343..60cb9505a 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -202,7 +202,11 @@ wait_for_pods() { # Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. # +# When TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, returns https://. +# Otherwise falls back to http://. +# # Lookup order: +# 0. If TLS enabled + hostname set, return https:// immediately # 1. LoadBalancer external IP (cloud assigns a public/internal IP) # 2. LoadBalancer hostname (some clouds return a DNS name instead) # 3. Controller ClusterIP (fallback – works from inside the cluster) @@ -212,7 +216,17 @@ wait_for_pods() { # [[ -n "$url" ]] && echo "OSMO reachable at $url" detect_service_url() { local ns="${INGRESS_NAMESPACE:-ingress-nginx}" - local url="" + local tls_enabled="${TLS_ENABLED:-false}" + local hostname="${OSMO_INGRESS_HOSTNAME:-}" + local scheme="http" + + if [[ "$tls_enabled" == "true" ]]; then + scheme="https" + if [[ -n "$hostname" ]]; then + echo "${scheme}://${hostname}" + return 0 + fi + fi # Find the controller service (works for the community ingress-nginx chart) local lb_ip lb_host cluster_ip svc_name @@ -225,7 +239,7 @@ detect_service_url() { lb_ip=$(kubectl get svc "$svc_name" -n "$ns" \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) if [[ -n "$lb_ip" ]]; then - echo "http://${lb_ip}" + echo "${scheme}://${lb_ip}" return 0 fi @@ -233,7 +247,7 @@ detect_service_url() { lb_host=$(kubectl get svc "$svc_name" -n "$ns" \ -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true) if [[ -n "$lb_host" ]]; then - echo 
# ---------------------------------------------------------------------------
# Envoy-aware helpers
# When an Envoy sidecar is present, these helpers port-forward directly to the
# OSMO pod on port 8000 (bypassing Envoy) and inject x-osmo-user / x-osmo-roles
# headers so the API recognises the caller as an admin.
# ---------------------------------------------------------------------------

# Check whether the first pod matching a label selector has an "envoy" container.
# Arguments: $1 - namespace (default: osmo)
#            $2 - label selector (default: app=osmo-service)
# Returns:   0 when an "envoy" container is present, non-zero otherwise
has_envoy_sidecar() {
  local ns="${1:-osmo}"
  # The OSMO Helm chart uses the label "app=osmo-service" (not app.kubernetes.io/name)
  local label="${2:-app=osmo-service}"
  kubectl get pods -n "$ns" -l "$label" \
    -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null | grep -qw envoy
}

# Port-forward to the OSMO service.
# When Envoy is present, forwards to the first matching *pod* on port 8000
# (direct access, no auth). Otherwise forwards to svc/osmo-service:80.
# Sets and exports PORT_FORWARD_PID and _OSMO_AUTH_BYPASS.
# Arguments: $1 - namespace (default: osmo)
#            $2 - local port (default: 8080)
start_osmo_port_forward() {
  local ns="${1:-osmo}"
  local local_port="${2:-8080}"
  local pod_name=""

  if has_envoy_sidecar "$ns" "app=osmo-service"; then
    pod_name=$(kubectl get pods -n "$ns" -l app=osmo-service \
      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
  fi

  if [[ -n "$pod_name" ]]; then
    log_info "Envoy sidecar detected — port-forwarding to pod:8000 (bypass Envoy)"
    kubectl port-forward -n "$ns" "pod/${pod_name}" "${local_port}:8000" &>/dev/null &
    PORT_FORWARD_PID=$!
    _OSMO_AUTH_BYPASS="true"
  else
    # Also reached when the pod-name lookup returned nothing: guards against
    # port-forwarding to an empty "pod/" target, which would fail silently.
    log_info "No Envoy sidecar — port-forwarding to svc/osmo-service:80"
    kubectl port-forward -n "$ns" svc/osmo-service "${local_port}:80" &>/dev/null &
    PORT_FORWARD_PID=$!
    _OSMO_AUTH_BYPASS="false"
  fi
  export PORT_FORWARD_PID _OSMO_AUTH_BYPASS
}

# Wrapper around curl that injects admin headers when bypassing Envoy.
# Arguments: $1 - HTTP method, $2 - URL, remaining args passed to curl verbatim
osmo_curl() {
  local method="$1"; shift
  local url="$1"; shift
  local extra_args=("$@")

  if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then
    curl -s -X "$method" "$url" \
      -H "x-osmo-user: osmo-admin" \
      -H "x-osmo-roles: osmo-admin,osmo-user" \
      -H "Content-Type: application/json" \
      "${extra_args[@]}"
  else
    curl -s -X "$method" "$url" \
      -H "Content-Type: application/json" \
      "${extra_args[@]}"
  fi
}

# Login to OSMO via the CLI. No-op when bypassing Envoy (headers handle auth).
# Arguments: $1 - local port (default: 8080)
osmo_login() {
  local port="${1:-8080}"
  if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then
    log_info "Auth bypass active — skipping osmo login"
    return 0
  fi
  osmo login "http://localhost:${port}" --method dev --username admin 2>/dev/null
}
# Update an OSMO config.
# When bypassing Envoy, wraps the JSON file in a {"configs_dict": ...} payload
# and PATCHes the API directly (avoids the "osmo config update" CLI, which may
# not work without a real session). Otherwise delegates to `osmo config update`.
# Arguments: $1 - config type (e.g. GPU, Pool)
#            $2 - path to a JSON file containing the config payload
#            $3 - change description (default: "Update config")
# Globals:   _OSMO_AUTH_BYPASS (selects the code path), _OSMO_PORT (API port)
# Returns:   non-zero when the payload cannot be built or the update fails
osmo_config_update() {
  local config_type="$1"
  local json_file="$2"
  local description="${3:-Update config}"
  local port="${_OSMO_PORT:-8080}"
  local config_type_lower
  config_type_lower=$(echo "$config_type" | tr '[:upper:]' '[:lower:]')

  if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then
    # Let jq read the file itself and bail out on failure: the previous
    # $(cat ...) form swallowed errors and PATCHed an empty body when the
    # file was missing or not valid JSON.
    local payload
    payload=$(jq '{configs_dict: .}' "$json_file") || return 1
    osmo_curl PATCH "http://localhost:${port}/api/configs/${config_type_lower}" -d "$payload"
  else
    osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null
  fi
}
"oauth2DevicePollingInterval": 5, + "enabled": true, + "sslRequired": "external", + "registrationAllowed": false, + "registrationEmailAsUsername": false, + "rememberMe": false, + "verifyEmail": false, + "loginWithEmailAllowed": false, + "duplicateEmailsAllowed": false, + "resetPasswordAllowed": false, + "editUsernameAllowed": false, + "bruteForceProtected": true, + "permanentLockout": false, + "maxTemporaryLockouts": 0, + "bruteForceStrategy": "MULTIPLE", + "maxFailureWaitSeconds": 300, + "minimumQuickLoginWaitSeconds": 60, + "waitIncrementSeconds": 60, + "quickLoginCheckMilliSeconds": 1000, + "maxDeltaTimeSeconds": 43200, + "failureFactor": 30, + "roles": { + "realm": [ + { + "id": "2fbf71d8-d3c1-4de3-8c08-ae55b254e092", + "name": "uma_authorization", + "description": "${role_uma_authorization}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "e22b93a7-88eb-4f66-a5cc-7c68a35d72fb", + "name": "offline_access", + "description": "${role_offline-access}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "c3d524ce-b3c8-42fd-9e6b-777a32960bb2", + "name": "admin", + "description": "${role_admin}", + "composite": true, + "composites": { + "realm": [ + "create-realm" + ], + "client": { + "realm-management": [ + "manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + "description": 
"${role_default-roles}", + "composite": true, + "composites": { + "realm": [ + "offline_access", + "uma_authorization" + ], + "client": { + "account": [ + "manage-account", + "view-profile" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "f5584dff-7c44-4204-b387-e3caf8ea3f46", + "name": "create-realm", + "description": "${role_create-realm}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + } + ], + "client": { + "osmo-realm": [], + "realm-management": [ + { + "id": "b8b96d4c-fc77-4e20-bc64-4918144dfdcf", + "name": "manage-realm", + "description": "${role_manage-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1dbd9f8f-e5e6-41b3-ba7c-746835fd9b79", + "name": "query-clients", + "description": "${role_query-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d27fc846-afad-42f9-8b11-636f4c535a36", + "name": "view-users", + "description": "${role_view-users}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-groups", + "query-users" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "3c345b77-4bdb-4360-bf81-fe85a77cbff7", + "name": "manage-identity-providers", + "description": "${role_manage-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "4953639a-2db7-45d7-a734-c42b487647c5", + "name": "impersonation", + "description": "${role_impersonation}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": 
"ae14995a-6e23-4b1d-a10d-dd0feebf1d4a", + "name": "view-events", + "description": "${role_view-events}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "5ae16954-f8ad-4237-be92-1eb6916ce6cb", + "name": "manage-authorization", + "description": "${role_manage-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "7663ba0a-60f3-46bb-9232-3a2cc1832e62", + "name": "query-realms", + "description": "${role_query-realms}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "121f50ad-06c7-4541-a40f-400710228515", + "name": "manage-clients", + "description": "${role_manage-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d8c6a12c-240c-415c-9299-30f5292d2b90", + "name": "view-clients", + "description": "${role_view-clients}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-clients" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "70ebf14f-cf79-4ad7-b4c4-3d5289288ec0", + "name": "create-client", + "description": "${role_create-client}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1abf94ab-c2a7-469c-b081-584fbbb66046", + "name": "query-groups", + "description": "${role_query-groups}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "f8e1d204-7b77-446a-84fb-675c8c85e1f1", + "name": "realm-admin", + "description": "${role_realm-admin}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + 
"manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "72066e7f-f80f-4008-a0b3-531d3aebd2f0", + "name": "view-identity-providers", + "description": "${role_view-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "29649597-fdc9-4330-a96d-94218a1e91b2", + "name": "view-realm", + "description": "${role_view-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "12c80e9d-c3d9-4e61-91ab-c986e3aafe48", + "name": "view-authorization", + "description": "${role_view-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "bde16849-39b1-4c85-985d-40e9a178e873", + "name": "manage-users", + "description": "${role_manage-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "62463d22-8113-41e0-af6a-fa81883c475d", + "name": "query-users", + "description": "${role_query-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "e1afbd19-239f-4e78-abd9-5019b6baa7e2", + "name": "manage-events", + "description": "${role_manage-events}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + } + ], + "osmo-browser-flow": [ + 
{ + "id": "2cfce9e9-000e-4de8-a0b6-50f0a4252db3", + "name": "dashboard-admin", + "description": "Able to make change to the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "454726d1-4f76-47f6-bcfa-5d64f759134f", + "name": "grafana-user", + "description": "Able to view dashboards in grafana", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9d91ae54-e69b-46e8-baee-7a16f044ded1", + "name": "osmo-user", + "description": "A regular user of osmo who can submit and query workflows and datasets", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9ec3a04d-49a4-414b-9e2f-35b70bbea18b", + "name": "dashboard-user", + "description": "Able to view the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "dfd62581-88c7-4ebb-beac-7555d1aef105", + "name": "grafana-admin", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "aa86ac92-9df4-499c-9f78-e3ed600ddc15", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + } + ], + "security-admin-console": [], + "admin-cli": [], + "account-console": [], + "broker": [ + { + "id": "44300967-5867-4c57-a59a-5b8302cb8323", + "name": "read-token", + "description": "${role_read-token}", + "composite": false, + "clientRole": true, + "containerId": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "attributes": {} + } + ], + "osmo-device": [ + { + "id": "e126038f-20eb-4d31-a95b-e5267eb8c7f1", + "name": "osmo-user", + 
"description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "20874405-f96b-456b-a3b8-86cfe8740144", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "94a41f7f-9927-489f-aa76-a9e3dafb4ed5", + "name": "osmo-backend", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + } + ], + "account": [ + { + "id": "358c4e88-41b8-458b-83d9-e4c86a357095", + "name": "manage-account-links", + "description": "${role_manage-account-links}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "499f54a7-ccc5-4fef-bece-9ccdc6a80308", + "name": "manage-consent", + "description": "${role_manage-consent}", + "composite": true, + "composites": { + "client": { + "account": [ + "view-consent" + ] + } + }, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "f14ea475-e733-4f69-8475-693da2992a72", + "name": "view-applications", + "description": "${role_view-applications}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "aea168f8-7115-468b-9118-aae87937dee9", + "name": "view-consent", + "description": "${role_view-consent}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "47acd969-e55d-4382-946b-67fb2e4bb119", + "name": "manage-account", + "description": "${role_manage-account}", + "composite": true, + "composites": { + "client": { + "account": [ + "manage-account-links" + ] + } + }, + "clientRole": true, + "containerId": 
"049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "102cd4a5-8e95-4d3c-87de-a98c2958f5c0", + "name": "view-groups", + "description": "${role_view-groups}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "b6da542f-977e-437e-8d24-6cb4ed4612af", + "name": "delete-account", + "description": "${role_delete-account}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "2da758ad-a74d-43ef-b911-6b52c8b60d90", + "name": "view-profile", + "description": "${role_view-profile}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + } + ] + } + }, + "groups": [ + { + "id": "979a1cd5-b392-4905-a868-17603faf9ca9", + "name": "Admin", + "path": "/Admin", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "osmo-admin" + ], + "osmo-device": [ + "osmo-user", + "osmo-admin" + ] + } + }, + { + "id": "2fc39861-b636-47c8-b57b-d1719466759c", + "name": "Backend Operator", + "path": "/Backend Operator", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-device": [ + "osmo-backend" + ] + } + }, + { + "id": "57a9b7f0-36ec-46c5-9781-49d53b1c6468", + "name": "User", + "path": "/User", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "grafana-user", + "dashboard-user" + ], + "osmo-device": [ + "osmo-user" + ] + } + } + ], + "defaultRole": { + "id": "996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + "description": "${role_default-roles}", + "composite": true, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c" + }, + "requiredCredentials": [ + "password" + ], + "otpPolicyType": "totp", + "otpPolicyAlgorithm": 
"HmacSHA1", + "otpPolicyInitialCounter": 0, + "otpPolicyDigits": 6, + "otpPolicyLookAheadWindow": 1, + "otpPolicyPeriod": 30, + "otpPolicyCodeReusable": false, + "otpSupportedApplications": [ + "totpAppFreeOTPName", + "totpAppGoogleName", + "totpAppMicrosoftAuthenticatorName" + ], + "localizationTexts": {}, + "webAuthnPolicyRpEntityName": "keycloak", + "webAuthnPolicySignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyRpId": "", + "webAuthnPolicyAttestationConveyancePreference": "not specified", + "webAuthnPolicyAuthenticatorAttachment": "not specified", + "webAuthnPolicyRequireResidentKey": "not specified", + "webAuthnPolicyUserVerificationRequirement": "not specified", + "webAuthnPolicyCreateTimeout": 0, + "webAuthnPolicyAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyAcceptableAaguids": [], + "webAuthnPolicyExtraOrigins": [], + "webAuthnPolicyPasswordlessRpEntityName": "keycloak", + "webAuthnPolicyPasswordlessSignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyPasswordlessRpId": "", + "webAuthnPolicyPasswordlessAttestationConveyancePreference": "not specified", + "webAuthnPolicyPasswordlessAuthenticatorAttachment": "not specified", + "webAuthnPolicyPasswordlessRequireResidentKey": "not specified", + "webAuthnPolicyPasswordlessUserVerificationRequirement": "not specified", + "webAuthnPolicyPasswordlessCreateTimeout": 0, + "webAuthnPolicyPasswordlessAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyPasswordlessAcceptableAaguids": [], + "webAuthnPolicyPasswordlessExtraOrigins": [], + "scopeMappings": [ + { + "clientScope": "offline_access", + "roles": [ + "offline_access" + ] + } + ], + "clientScopeMappings": { + "account": [ + { + "client": "account-console", + "roles": [ + "manage-account", + "view-groups" + ] + } + ] + }, + "clients": [ + { + "id": "049b45a3-ba14-4735-8168-c9be73625a6f", + "clientId": "account", + "name": "${client_account}", + "rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": 
false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "a18dadb1-a13d-4523-8e33-446ff5781676", + "clientId": "account-console", + "name": "${client_account-console}", + "rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "d3db99fd-64a1-48b8-82bd-a92533e2fd4c", + "name": "audience resolve", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + 
"consentRequired": false, + "config": {} + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "14047566-1501-4403-92c7-418ef38e3ba4", + "clientId": "admin-cli", + "name": "${client_admin-cli}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "clientId": "broker", + "name": "${client_broker}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + 
"fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "clientId": "osmo-browser-flow", + "name": "Osmo Browser Flow", + "description": "Allow logging into osmo using the authorization code based browser flow", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com/docs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "secret": "**********", + "redirectUris": [ + "", + "https://default.com/setup/getAToken", + "https://default.com/getAToken", + "https://default.com/api/auth/getAToken" + ], + "webOrigins": [ + "*", + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "client.secret.creation.time": "1762965594", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false", + "use.refresh.tokens": "true", + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "client_credentials.use_refresh_token": "false", + "acr.loa.map": "{}", + "require.pushed.authorization.requests": "false", + "tls.client.certificate.bound.access.tokens": "false", + "display.on.consent.screen": "false", + "token.response.type.bearer.lower-case": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + 
"nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "8fcbb19c-503b-4173-a35b-69cc23bc112f", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-browser-flow" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "clientId": "osmo-device", + "name": "Osmo device flow", + "description": "Allow login with devices such as cli", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "https://default.com/*" + ], + "webOrigins": [ + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "display.on.consent.screen": "false", + "oauth2.device.authorization.grant.enabled": "true", + "backchannel.logout.revoke.offline.tokens": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + 
"nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "21f8be09-ffc5-4a26-855b-6be4ab297c67", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-device" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "06a0fe4b-c247-4233-af67-78138bf5337a", + "clientId": "osmo-realm", + "name": "OSMO Realm", + "description": "", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [], + "optionalClientScopes": [] + }, + { + "id": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "clientId": "realm-management", + "name": "${client_realm-management}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + 
"clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "c70e9b76-96a2-41da-84da-df8b9e0d228d", + "clientId": "security-admin-console", + "name": "${client_security-admin-console}", + "rootUrl": "${authAdminUrl}", + "baseUrl": "/admin/osmo/console/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/admin/osmo/console/*" + ], + "webOrigins": [ + "+" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "e921764f-2d7f-4a08-833c-204801a096db", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + 
"user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + } + ], + "clientScopes": [ + { + "id": "e172a6de-ad7d-4cbd-be06-010d284b6806", + "name": "basic", + "description": "OpenID Connect scope for add all basic claims to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "e67f2d9e-7cf0-4875-a72d-ce4a086adf7b", + "name": "auth_time", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "AUTH_TIME", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "auth_time", + "jsonType.label": "long" + } + }, + { + "id": "eba73e8f-7d13-46c7-9e6e-44e8839b1022", + "name": "sub", + "protocol": "openid-connect", + "protocolMapper": "oidc-sub-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "introspection.token.claim": "true" + } + } + ] + }, + { + "id": "76307a43-d2c9-40df-a686-6c4c10e0f70d", + "name": "address", + "description": "OpenID Connect built-in scope: address", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${addressScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "32ac1e8f-3680-4c50-8bb4-7eed44c679b1", + "name": "address", + "protocol": "openid-connect", + "protocolMapper": "oidc-address-mapper", + "consentRequired": false, + "config": { + "user.attribute.formatted": 
"formatted", + "user.attribute.country": "country", + "user.attribute.postal_code": "postal_code", + "userinfo.token.claim": "true", + "user.attribute.street": "street", + "id.token.claim": "true", + "user.attribute.region": "region", + "access.token.claim": "true", + "user.attribute.locality": "locality" + } + } + ] + }, + { + "id": "67a444ee-3246-4878-a525-e0015e9b31cb", + "name": "offline_access", + "description": "OpenID Connect built-in scope: offline_access", + "protocol": "openid-connect", + "attributes": { + "consent.screen.text": "${offlineAccessScopeConsentText}", + "display.on.consent.screen": "true" + } + }, + { + "id": "1e8f098a-66fe-4df2-9547-47be0d040c53", + "name": "email", + "description": "OpenID Connect built-in scope: email", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${emailScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "00e95ac6-b825-4180-9558-4dffeac9584a", + "name": "email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "email", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9f5125d5-3b89-4f0f-a13e-b8fbb4d6afc1", + "name": "email verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-property-mapper", + "consentRequired": false, + "config": { + "user.attribute": "emailVerified", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "988f9517-5cd2-4b66-90ba-3399d667d0f8", + "name": "role_list", + "description": "SAML role list", + "protocol": "saml", + "attributes": { + "consent.screen.text": "${samlRoleListScopeConsentText}", + 
"display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "b78abf35-1108-40e2-a3c8-c6ea4200e817", + "name": "role list", + "protocol": "saml", + "protocolMapper": "saml-role-list-mapper", + "consentRequired": false, + "config": { + "single": "false", + "attribute.nameformat": "Basic", + "attribute.name": "Role" + } + } + ] + }, + { + "id": "f1dcc0f6-63be-4f85-a8cd-d43072e0eba4", + "name": "microprofile-jwt", + "description": "Microprofile - JWT built-in scope", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "bf488bdc-2622-45f0-95c2-df2d05fd3fab", + "name": "upn", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "upn", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5aa8e8c1-f0d7-46c4-b2da-24aa9608da9f", + "name": "groups", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "user.attribute": "foo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "groups", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "fe58e218-3aac-4780-8b5e-b61491cd457b", + "name": "profile", + "description": "OpenID Connect built-in scope: profile", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${profileScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "e0616aae-d3e0-4911-98b2-db72ad142938", + "name": "nickname", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + 
"user.attribute": "nickname", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "nickname", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "49cc1e1d-9401-4b57-b8a9-a37573f2eb06", + "name": "profile", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "profile", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "profile", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "e05eea05-f917-4ef3-a82f-501c82192bd6", + "name": "gender", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "gender", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "gender", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "89c031e1-bfad-4afd-af24-51db2c62a11f", + "name": "username", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "preferred_username", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "30d27d3e-3b72-49d1-a66f-0466b58dbf3b", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9fc26d9e-c109-4b30-8ec2-2fc2d95b11d6", + "name": "picture", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "picture", + 
"id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "picture", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5c0dbd32-7a45-4dc9-9e4f-37570ebf5d38", + "name": "family name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "lastName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "family_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "2de0c290-124a-41be-b7d8-f61f63eed5ef", + "name": "full name", + "protocol": "openid-connect", + "protocolMapper": "oidc-full-name-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + }, + { + "id": "369e67dd-fd5e-4d90-8d80-c945c7a0c049", + "name": "updated at", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "updatedAt", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "updated_at", + "jsonType.label": "long", + "userinfo.token.claim": "true" + } + }, + { + "id": "7557b943-11a1-42bb-a119-35e8da9fcb99", + "name": "birthdate", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "birthdate", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "birthdate", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "06359527-ce26-45f7-beba-7ccf5e71d6f5", + "name": "given name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "firstName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "given_name", + "jsonType.label": 
"String", + "userinfo.token.claim": "true" + } + }, + { + "id": "8f3bfe54-a74a-4eed-b2bd-4157fc574b57", + "name": "middle name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "middleName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "middle_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "a6cbf817-a0f5-483d-ae1e-c716d04e1645", + "name": "website", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "website", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "website", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "1322fc37-04e4-4e89-99d4-6c304ad36c96", + "name": "zoneinfo", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "zoneinfo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "zoneinfo", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "6aec68b8-7178-449d-9ba6-b6e1c2a9be73", + "name": "service_account", + "description": "Specific scope for a client enabled for service accounts", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "91715642-086a-493b-8f01-5c64d408b7e3", + "name": "Client ID", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "client_id", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "client_id", + "jsonType.label": "String" + } 
+ }, + { + "id": "78dcf109-44bb-4aca-9540-a8896f26e864", + "name": "Client Host", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientHost", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientHost", + "jsonType.label": "String" + } + }, + { + "id": "e28a076d-9ee0-46ec-a2f0-a147bab66a09", + "name": "Client IP Address", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientAddress", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientAddress", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "e728df12-1bff-418d-a68d-c2036d856db2", + "name": "roles", + "description": "OpenID Connect scope for add user roles to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "${rolesScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "993f7f9d-55ba-4c1f-b84a-76e2c733bc94", + "name": "client roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "resource_access.${client_id}.roles", + "jsonType.label": "String", + "multivalued": "true" + } + }, + { + "id": "f0b2b858-1cde-412b-a1c8-8ed3bd4e04d6", + "name": "realm roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + "user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "realm_access.roles", + "jsonType.label": "String", + 
"multivalued": "true" + } + }, + { + "id": "32ad3286-1486-4196-9232-533af4c10009", + "name": "audience resolve", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "efee9fbd-1a06-41d4-94d1-16b59f8d9a68", + "name": "web-origins", + "description": "OpenID Connect scope for add allowed web origins to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "61110fbc-75c7-40cd-aca2-9b7a714b0b22", + "name": "allowed web origins", + "protocol": "openid-connect", + "protocolMapper": "oidc-allowed-origins-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "4a0abefc-0423-403d-8383-10f989580c13", + "name": "phone", + "description": "OpenID Connect built-in scope: phone", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${phoneScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "acdce654-be20-4386-bd4f-edf2cd868f6b", + "name": "phone number", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumber", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "phone_number", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "37082e43-4429-479d-bd80-7b8d11b17769", + "name": "phone number verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumberVerified", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "phone_number_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + 
] + }, + { + "id": "1e5f680b-df5f-4d8c-b9c9-52b5445171ce", + "name": "acr", + "description": "OpenID Connect scope for add acr (authentication context class reference) to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "590accb2-1b94-452e-bb20-51bc643fe860", + "name": "acr loa level", + "protocol": "openid-connect", + "protocolMapper": "oidc-acr-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + } + ] + } + ], + "defaultDefaultClientScopes": [ + "role_list", + "profile", + "email", + "roles", + "web-origins", + "acr", + "basic" + ], + "defaultOptionalClientScopes": [ + "offline_access", + "address", + "phone", + "microprofile-jwt" + ], + "browserSecurityHeaders": { + "contentSecurityPolicyReportOnly": "", + "xContentTypeOptions": "nosniff", + "referrerPolicy": "no-referrer", + "xRobotsTag": "none", + "xFrameOptions": "SAMEORIGIN", + "contentSecurityPolicy": "frame-src 'self'; frame-ancestors 'self'; object-src 'none';", + "xXSSProtection": "1; mode=block", + "strictTransportSecurity": "max-age=31536000; includeSubDomains" + }, + "smtpServer": {}, + "eventsEnabled": false, + "eventsListeners": [ + "jboss-logging" + ], + "enabledEventTypes": [], + "adminEventsEnabled": false, + "adminEventsDetailsEnabled": false, + "identityProviders": [], + "identityProviderMappers": [], + "components": { + "org.keycloak.services.clientregistration.policy.ClientRegistrationPolicy": [ + { + "id": "76bd801e-c608-4338-8198-668c92446a35", + "name": "Full Scope Disabled", + "providerId": "scope", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "06472a8f-7614-4022-b08e-62f023a5fe0a", + "name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "anonymous", + "subComponents": {}, + "config": { + 
"allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "3667ac91-1abf-4124-91e6-ffc803dc29aa", + "name": "Consent Required", + "providerId": "consent-required", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "6e0c8a3f-b5f4-4a49-b44c-bde8ae314d89", + "name": "Max Clients Limit", + "providerId": "max-clients", + "subType": "anonymous", + "subComponents": {}, + "config": { + "max-clients": [ + "200" + ] + } + }, + { + "id": "62d78a88-78a2-4ea7-937b-9a062e946108", + "name": "Trusted Hosts", + "providerId": "trusted-hosts", + "subType": "anonymous", + "subComponents": {}, + "config": { + "host-sending-registration-request-must-match": [ + "true" + ], + "client-uris-must-match": [ + "true" + ] + } + }, + { + "id": "0ca9718d-bfca-4059-b7e8-e32ae3f70a7f", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "oidc-address-mapper", + "saml-user-property-mapper", + "oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-full-name-mapper", + "saml-role-list-mapper", + "saml-user-attribute-mapper", + "oidc-sha256-pairwise-sub-mapper" + ] + } + }, + { + "id": "9247c25c-ce3e-4858-8dda-b2c95b2f4d09", + "name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "2d3e37a6-c167-4992-abf8-8cbe22f1bcb9", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "anonymous", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "saml-user-property-mapper", + "oidc-full-name-mapper", + "oidc-address-mapper", + "saml-role-list-mapper", + "oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-sha256-pairwise-sub-mapper", + "saml-user-attribute-mapper" + ] + } + 
} + ], + "org.keycloak.userprofile.UserProfileProvider": [ + { + "id": "c12df2b1-cd7d-46b7-ba91-b4381a59f487", + "providerId": "declarative-user-profile", + "subComponents": {}, + "config": { + "kc.user.profile.config": [ + "{\"attributes\":[{\"name\":\"username\",\"displayName\":\"${username}\",\"validations\":{\"length\":{\"min\":3,\"max\":255},\"username-prohibited-characters\":{},\"up-username-not-idn-homograph\":{}},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"email\",\"displayName\":\"${email}\",\"validations\":{\"email\":{},\"length\":{\"max\":255}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"firstName\",\"displayName\":\"${firstName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"lastName\",\"displayName\":\"${lastName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false}],\"groups\":[{\"name\":\"user-metadata\",\"displayHeader\":\"User metadata\",\"displayDescription\":\"Attributes, which refer to user metadata\"}],\"unmanagedAttributePolicy\":\"ENABLED\"}" + ] + } + } + ], + "org.keycloak.keys.KeyProvider": [ + { + "id": "29577a17-9e8a-40cf-b804-cf36c2cf567c", + "name": "hmac-generated-hs512", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS512" + ] + } + }, + { + "id": "48051b03-e0a1-413d-af4a-d9c301f12662", + "name": "rsa-enc-generated", + "providerId": "rsa-enc-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + 
"RSA-OAEP" + ] + } + }, + { + "id": "04c1d0e1-6889-48d2-833a-449a2a9e6fe1", + "name": "hmac-generated", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS256" + ] + } + }, + { + "id": "500737be-f83b-4e67-954e-9e71ca7ed1b0", + "name": "rsa-generated", + "providerId": "rsa-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + }, + { + "id": "7842aa88-a8fb-49a2-ac10-e437337e236a", + "name": "aes-generated", + "providerId": "aes-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + } + ] + }, + "internationalizationEnabled": false, + "supportedLocales": [], + "authenticationFlows": [ + { + "id": "43f7c655-a9cd-4d53-8161-3b3d2008c126", + "alias": "Account verification options", + "description": "Method with which to verity the existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-email-verification", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Verify Existing Account by Re-authentication", + "userSetupAllowed": false + } + ] + }, + { + "id": "0f5c2215-5f40-4509-bb6f-f28c9b743388", + "alias": "Browser - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": 
false, + "userSetupAllowed": false + } + ] + }, + { + "id": "eb66c86a-efdc-4039-9153-cd4708f39ba7", + "alias": "Direct Grant - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e68e679a-5fc1-427b-93c6-5657f3ff6eb1", + "alias": "First broker login - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e4a832f6-bae3-41c6-8198-5c14c6ddf706", + "alias": "Handle Existing Account", + "description": "Handle what to do if there is existing account with same email/username like authenticated identity provider", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-confirm-link", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": 
true, + "flowAlias": "Account verification options", + "userSetupAllowed": false + } + ] + }, + { + "id": "2bbaf432-1058-4ee4-a994-d87f1c224032", + "alias": "Reset - Conditional OTP", + "description": "Flow to determine if the OTP should be reset or not. Set to REQUIRED to force.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "352782b8-ddae-4ddc-af19-86a2900ef1f9", + "alias": "User creation or linking", + "description": "Flow for the existing/non-existing user alternatives", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "create unique user config", + "authenticator": "idp-create-user-if-unique", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Handle Existing Account", + "userSetupAllowed": false + } + ] + }, + { + "id": "fdc0ecfb-67f8-4390-85a0-50ecfdc66800", + "alias": "Verify Existing Account by Re-authentication", + "description": "Reauthentication of existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-username-password-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + 
"priority": 20, + "autheticatorFlow": true, + "flowAlias": "First broker login - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a656206c-59b9-47cf-8880-c0f04f04a0c3", + "alias": "browser", + "description": "browser based authentication", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-cookie", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-spnego", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "identity-provider-redirector", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 25, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "forms", + "userSetupAllowed": false + } + ] + }, + { + "id": "7616793a-19e4-4d97-b7ae-ab962acaf444", + "alias": "clients", + "description": "Base authentication for clients", + "providerId": "client-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "client-secret", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-secret-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 30, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-x509", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 40, + 
"autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "1f5446d7-d5de-47fb-8e15-347105d3d062", + "alias": "direct grant", + "description": "OpenID Connect Resource Owner Grant", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "direct-grant-validate-username", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "Direct Grant - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a55463dd-3ced-4102-a263-c121db059379", + "alias": "docker auth", + "description": "Used by Docker clients to authenticate against the IDP", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "docker-http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "646a12ee-99e7-41cd-a1ea-3ed5e5a96dcf", + "alias": "first broker login", + "description": "Actions taken after first broker login with identity provider account, which is not yet linked to any Keycloak account", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "review profile config", + "authenticator": "idp-review-profile", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + 
"autheticatorFlow": true, + "flowAlias": "User creation or linking", + "userSetupAllowed": false + } + ] + }, + { + "id": "03f283e4-7b80-4b38-b90d-33ba8b0a07c3", + "alias": "forms", + "description": "Username, password, otp and other auth forms.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-username-password-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Browser - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "047f04f4-b2c9-4aa9-bc38-4ed2c17d3e2c", + "alias": "registration", + "description": "registration flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-page-form", + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": true, + "flowAlias": "registration form", + "userSetupAllowed": false + } + ] + }, + { + "id": "51cfacd6-9ee8-4fb2-a3fe-9e00246d9877", + "alias": "registration form", + "description": "registration form", + "providerId": "form-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-user-creation", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "registration-password-action", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 50, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "registration-recaptcha-action", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 60, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, 
+ { + "id": "28bb511d-c4ea-4bb8-805c-086eeaf7b239", + "alias": "reset credentials", + "description": "Reset credentials for a user if they forgot their password or something", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "reset-credentials-choose-user", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-credential-email", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 30, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 40, + "autheticatorFlow": true, + "flowAlias": "Reset - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "d0189a78-5979-47ce-8536-32c8f6dec1b6", + "alias": "saml ecp", + "description": "SAML ECP Profile Authentication Flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + } + ], + "authenticatorConfig": [ + { + "id": "09fd7502-4e05-437f-865a-221fa1297e67", + "alias": "create unique user config", + "config": { + "require.password.update.after.registration": "false" + } + }, + { + "id": "9abca294-1e03-418f-841c-18b00053f949", + "alias": "review profile config", + "config": { + "update.profile.on.first.login": "missing" + } + } + ], + "requiredActions": [ + { + "alias": "CONFIGURE_TOTP", + "name": "Configure OTP", + "providerId": "CONFIGURE_TOTP", + "enabled": true, + "defaultAction": false, + 
"priority": 10, + "config": {} + }, + { + "alias": "TERMS_AND_CONDITIONS", + "name": "Terms and Conditions", + "providerId": "TERMS_AND_CONDITIONS", + "enabled": false, + "defaultAction": false, + "priority": 20, + "config": {} + }, + { + "alias": "UPDATE_PASSWORD", + "name": "Update Password", + "providerId": "UPDATE_PASSWORD", + "enabled": true, + "defaultAction": false, + "priority": 30, + "config": {} + }, + { + "alias": "UPDATE_PROFILE", + "name": "Update Profile", + "providerId": "UPDATE_PROFILE", + "enabled": true, + "defaultAction": false, + "priority": 40, + "config": {} + }, + { + "alias": "VERIFY_EMAIL", + "name": "Verify Email", + "providerId": "VERIFY_EMAIL", + "enabled": true, + "defaultAction": false, + "priority": 50, + "config": {} + }, + { + "alias": "delete_account", + "name": "Delete Account", + "providerId": "delete_account", + "enabled": false, + "defaultAction": false, + "priority": 60, + "config": {} + }, + { + "alias": "webauthn-register", + "name": "Webauthn Register", + "providerId": "webauthn-register", + "enabled": true, + "defaultAction": false, + "priority": 70, + "config": {} + }, + { + "alias": "webauthn-register-passwordless", + "name": "Webauthn Register Passwordless", + "providerId": "webauthn-register-passwordless", + "enabled": true, + "defaultAction": false, + "priority": 80, + "config": {} + }, + { + "alias": "delete_credential", + "name": "Delete Credential", + "providerId": "delete_credential", + "enabled": true, + "defaultAction": false, + "priority": 100, + "config": {} + }, + { + "alias": "update_user_locale", + "name": "Update User Locale", + "providerId": "update_user_locale", + "enabled": true, + "defaultAction": false, + "priority": 1000, + "config": {} + } + ], + "browserFlow": "browser", + "registrationFlow": "registration", + "directGrantFlow": "direct grant", + "resetCredentialsFlow": "reset credentials", + "clientAuthenticationFlow": "clients", + "dockerAuthenticationFlow": "docker auth", + 
"firstBrokerLoginFlow": "first broker login", + "attributes": { + "cibaBackchannelTokenDeliveryMode": "poll", + "cibaExpiresIn": "120", + "cibaAuthRequestedUserHint": "login_hint", + "oauth2DeviceCodeLifespan": "600", + "clientOfflineSessionMaxLifespan": "0", + "oauth2DevicePollingInterval": "5", + "clientSessionIdleTimeout": "0", + "parRequestUriLifespan": "60", + "clientSessionMaxLifespan": "0", + "clientOfflineSessionIdleTimeout": "0", + "cibaInterval": "5", + "realmReusableOtpCode": "false" + }, + "keycloakVersion": "26.1.1", + "userManagedAccessAllowed": false, + "organizationsEnabled": false, + "verifiableCredentialsEnabled": false, + "adminPermissionsEnabled": false, + "clientProfiles": { + "profiles": [] + }, + "clientPolicies": { + "policies": [] + } +} From 0d3ac8e9642c10cf843f2b397e27b73071e64abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 17:50:56 +0100 Subject: [PATCH 18/37] - bug fixes --- .../002-setup/05-deploy-osmo-control-plane.sh | 48 +++++++++++++++++- .../002-setup/06-deploy-osmo-backend.sh | 42 +++++++--------- .../cleanup/uninstall-osmo-backend.sh | 49 ++++++++++++++++--- 3 files changed, 105 insertions(+), 34 deletions(-) diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh index 147967a69..d0480abf2 100755 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh @@ -430,6 +430,24 @@ meks: log_success "MEK secrets created" +# If the MEK was regenerated (new key), OSMO's encrypted signing keys in the DB +# will be invalid (encrypted with the old MEK). Delete stale service_auth so OSMO +# re-generates signing keys on next startup with the current MEK. +# This is safe on first deploy (no-op) and on re-runs (idempotent). 
+if [[ -n "${POSTGRES_HOST:-}" && -n "${POSTGRES_PASSWORD:-}" ]]; then + log_info "Clearing stale JWT signing keys (will be regenerated on startup)..." + kubectl run osmo-mek-key-reset --namespace "${OSMO_NAMESPACE}" \ + --image=postgres:16-alpine --restart=Never \ + --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ + --command -- sh -c "psql -h '${POSTGRES_HOST}' -p '${POSTGRES_PORT:-5432}' \ + -U '${POSTGRES_USER:-osmo_admin}' -d osmo \ + -c \"DELETE FROM configs WHERE key = 'service_auth' AND type = 'SERVICE';\" 2>/dev/null || true" \ + 2>/dev/null || true + kubectl wait --for=condition=Ready pod/osmo-mek-key-reset -n "${OSMO_NAMESPACE}" --timeout=30s 2>/dev/null || true + sleep 5 + kubectl delete pod osmo-mek-key-reset -n "${OSMO_NAMESPACE}" --force 2>/dev/null || true +fi + # ----------------------------------------------------------------------------- # Step 3.5: Deploy Redis (Required for OSMO rate limiting) # ----------------------------------------------------------------------------- @@ -1144,10 +1162,14 @@ services: nginx.ingress.kubernetes.io/proxy-busy-buffers-size: "32k" nginx.ingress.kubernetes.io/large-client-header-buffers: "4 16k" # Authentication configuration + # NOTE: auth.enabled must be false — Envoy sidecars handle JWT/OAuth2 auth externally. + # Setting auth.enabled=true breaks OSMO's internal JWT signing (/api/auth/jwt/access_token) + # because the service_auth code path conflicts with external OIDC endpoints. + # The OIDC endpoints are still listed so the osmo CLI (osmo login) can discover them. 
$(if [[ "$AUTH_ENABLED" == "true" ]]; then cat </dev/null || echo "") OSMO_AGENT=$(kubectl get svc -n osmo osmo-agent -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") - - if [[ -n "$OSMO_AGENT" ]]; then + + if [[ -n "$OSMO_AGENT_INTERNAL" ]]; then + OSMO_SERVICE_URL="http://osmo-agent-internal.osmo.svc.cluster.local:80" + log_success "In-cluster Agent URL (internal, no Envoy): ${OSMO_SERVICE_URL}" + elif [[ -n "$OSMO_AGENT" ]]; then OSMO_SERVICE_URL="http://osmo-agent.osmo.svc.cluster.local:80" log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" else - # Fallback: try to detect from any osmo-agent service - OSMO_AGENT=$(kubectl get svc -n osmo -l app.kubernetes.io/name=agent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") - if [[ -n "$OSMO_AGENT" ]]; then - OSMO_SERVICE_URL="http://${OSMO_AGENT}.osmo.svc.cluster.local:80" - log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" - else - echo "" - log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./05-deploy-osmo-control-plane.sh" - log_error "Note: Backend operators require osmo-agent service for WebSocket connections" - exit 1 - fi + echo "" + log_error "Could not detect OSMO Agent service. 
Deploy OSMO first: ./05-deploy-osmo-control-plane.sh" + log_error "Note: Backend operators require osmo-agent service for WebSocket connections" + exit 1 fi fi @@ -130,7 +128,7 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then # Detect if Keycloak auth is active KEYCLOAK_ENABLED="false" if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]] || kubectl get svc keycloak -n osmo &>/dev/null; then - if has_envoy_sidecar osmo "app.kubernetes.io/name=service"; then + if has_envoy_sidecar osmo "app=osmo-service"; then KEYCLOAK_ENABLED="true" fi fi @@ -142,15 +140,11 @@ if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") - TOKEN_RESPONSE=$(osmo_curl POST "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}" \ - -d "{\"description\":\"Backend Operator Token\",\"expires_at\":\"${EXPIRY_DATE}\",\"roles\":[\"osmo-backend\"]}") - - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '.token // empty' 2>/dev/null || echo "") + TOKEN_RESPONSE=$(osmo_curl POST \ + "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}?expires_at=${EXPIRY_DATE}&roles=osmo-backend") - if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then - # Fallback: try extracting from different response format - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '.access_token // empty' 2>/dev/null || echo "") - fi + # API returns the token as a plain JSON string (e.g. "abc123...") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '. 
// empty' 2>/dev/null || echo "") if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then log_error "Failed to create service token via API" diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh index 2db7ef8da..cce604c99 100755 --- a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh +++ b/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh @@ -1,6 +1,7 @@ #!/bin/bash # -# Uninstall OSMO Backend +# Uninstall OSMO Backend Operator +# Reverses everything deployed by 06-deploy-osmo-backend.sh # set -e @@ -9,22 +10,54 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" +OSMO_OPERATOR_NAMESPACE="osmo-operator" +OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" + echo "" echo "========================================" -echo " Uninstalling OSMO Backend" +echo " Uninstalling OSMO Backend Operator" echo "========================================" echo "" -log_warning "This will remove OSMO Backend services" +log_warning "This will remove:" +echo " - Helm release: osmo-operator (namespace: ${OSMO_OPERATOR_NAMESPACE})" +echo " - Secret: osmo-operator-token (namespace: ${OSMO_OPERATOR_NAMESPACE})" +echo " - Namespace: ${OSMO_OPERATOR_NAMESPACE}" +echo " - Namespace: ${OSMO_WORKFLOWS_NAMESPACE} (and all workflow pods)" +echo "" read_prompt_var "Continue? (y/N)" confirm "" if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then log_info "Cancelled" exit 0 fi -log_info "Removing OSMO Backend..." -kubectl delete deployment osmo-backend -n "${OSMO_NAMESPACE}" --ignore-not-found -kubectl delete service osmo-backend -n "${OSMO_NAMESPACE}" --ignore-not-found -kubectl delete service osmo-api -n "${OSMO_NAMESPACE}" --ignore-not-found +# Uninstall Helm release +if helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then + log_info "Uninstalling Helm release: osmo-operator..." 
+ helm uninstall osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" --wait --timeout 5m + log_success "Helm release uninstalled" +else + log_info "Helm release osmo-operator not found — skipping" +fi + +# Delete secrets +log_info "Removing secrets..." +kubectl delete secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found + +# Delete the internal agent service (created by 05-deploy-osmo-control-plane.sh for backend operator) +log_info "Removing osmo-agent-internal service..." +kubectl delete svc osmo-agent-internal -n "${OSMO_NAMESPACE}" --ignore-not-found -log_success "OSMO Backend uninstalled" +# Delete namespaces (this also removes any remaining resources inside them) +log_info "Deleting namespace: ${OSMO_WORKFLOWS_NAMESPACE}..." +kubectl delete namespace "${OSMO_WORKFLOWS_NAMESPACE}" --ignore-not-found --wait=false + +log_info "Deleting namespace: ${OSMO_OPERATOR_NAMESPACE}..." +kubectl delete namespace "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found --wait=false + +echo "" +log_success "OSMO Backend Operator uninstalled" +echo "" +echo "Note: Namespace deletion may continue in the background." 
+echo " kubectl get ns ${OSMO_OPERATOR_NAMESPACE} ${OSMO_WORKFLOWS_NAMESPACE} 2>/dev/null" +echo "" From b7ca0283b51928ecca20deae2a62690593331beb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 18:10:08 +0100 Subject: [PATCH 19/37] - bug pools --- .../002-setup/09-configure-gpu-platform.sh | 78 ++++++++++++++----- .../osmo/deploy/002-setup/lib/common.sh | 7 ++ 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh index d18f15208..7d9268f17 100755 --- a/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh +++ b/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh @@ -6,8 +6,8 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" -OSMO_URL="${OSMO_URL:-http://localhost:8080}" OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" echo "" @@ -20,12 +20,11 @@ echo "" check_kubectl || exit 1 # ----------------------------------------------------------------------------- -# Start port-forward +# Start port-forward (auto-detects Envoy and bypasses if needed) # ----------------------------------------------------------------------------- log_info "Starting port-forward to OSMO service..." - -kubectl port-forward -n "${OSMO_NAMESPACE}" svc/osmo-service 8080:80 &>/dev/null & -PORT_FORWARD_PID=$! +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 +export _OSMO_PORT=8080 cleanup_port_forward() { if [[ -n "${PORT_FORWARD_PID:-}" ]]; then @@ -35,29 +34,68 @@ cleanup_port_forward() { } trap cleanup_port_forward EXIT -# Wait for port-forward to be ready +OSMO_URL="http://localhost:8080" + +# Wait for port-forward to be ready (reject 302 — that means Envoy redirect, not direct) log_info "Waiting for port-forward to be ready..." max_wait=30 elapsed=0 -while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do +while true; do + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null || echo "000") + if [[ "$HTTP_CODE" == "200" ]]; then + break + fi sleep 1 ((elapsed += 1)) if [[ $elapsed -ge $max_wait ]]; then - log_error "Port-forward failed to start within ${max_wait}s" + log_error "Port-forward failed to start within ${max_wait}s (last HTTP: ${HTTP_CODE})" exit 1 fi done log_success "Port-forward ready" +# Login (no-op when bypassing Envoy) +osmo_login 8080 + +# ----------------------------------------------------------------------------- +# Step 0: Label nodes with OSMO pool/platform +# ----------------------------------------------------------------------------- +# OSMO discovers resources via node labels: +# osmo.nvidia.com/pool= — assigns node to a pool +# osmo.nvidia.com/platform= — assigns node to a platform within the pool +# GPU nodes get platform=gpu, CPU-only nodes get platform=default. +log_info "Labeling nodes with OSMO pool/platform..." 
+ +NODE_COUNT=0 +GPU_NODE_COUNT=0 +for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do + has_gpu=$(kubectl get node "$node" -o jsonpath='{.metadata.labels.nvidia\.com/gpu\.present}' 2>/dev/null) + gpu_count=$(kubectl get node "$node" -o jsonpath='{.status.allocatable.nvidia\.com/gpu}' 2>/dev/null) + + kubectl label node "$node" osmo.nvidia.com/pool=default --overwrite &>/dev/null + + if [[ "$has_gpu" == "true" ]] || [[ -n "$gpu_count" && "$gpu_count" -gt 0 ]] 2>/dev/null; then + kubectl label node "$node" osmo.nvidia.com/platform=gpu --overwrite &>/dev/null + ((GPU_NODE_COUNT++)) || true + else + kubectl label node "$node" osmo.nvidia.com/platform=default --overwrite &>/dev/null + fi + ((NODE_COUNT++)) || true +done + +log_success "Labeled ${NODE_COUNT} nodes (${GPU_NODE_COUNT} GPU, $((NODE_COUNT - GPU_NODE_COUNT)) CPU-only)" + +# Give the backend listener time to process node label changes +sleep 5 + # ----------------------------------------------------------------------------- # Step 1: Create GPU pod template # ----------------------------------------------------------------------------- log_info "Creating gpu_tolerations pod template..." -RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT \ - "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ - -H "Content-Type: application/json" \ - -d @"${SCRIPT_DIR}/gpu_pod_template.json") +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ + -d @"${SCRIPT_DIR}/gpu_pod_template.json" \ + -w "\n%{http_code}") HTTP_CODE=$(echo "$RESPONSE" | tail -n1) BODY=$(echo "$RESPONSE" | sed '$d') @@ -74,10 +112,9 @@ fi # ----------------------------------------------------------------------------- log_info "Creating gpu platform in default pool..." 
-RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT \ - "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ - -H "Content-Type: application/json" \ - -d @"${SCRIPT_DIR}/gpu_platform_update.json") +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ + -d @"${SCRIPT_DIR}/gpu_platform_update.json" \ + -w "\n%{http_code}") HTTP_CODE=$(echo "$RESPONSE" | tail -n1) BODY=$(echo "$RESPONSE" | sed '$d') @@ -96,11 +133,11 @@ log_info "Verifying configuration..." echo "" echo "Pod templates:" -curl -s "${OSMO_URL}/api/configs/pod_template" | jq 'keys' +osmo_curl GET "${OSMO_URL}/api/configs/pod_template" | jq 'keys' echo "" echo "GPU platform config:" -curl -s "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' +osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' # ----------------------------------------------------------------------------- # Step 4: Check GPU resources @@ -108,13 +145,13 @@ curl -s "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' log_info "Checking GPU resources..." 
sleep 3 # Wait for backend to pick up changes -RESOURCE_COUNT=$(curl -s "${OSMO_URL}/api/resources" | jq '[.resources[] | select(.allocatable_fields.gpu != null)] | length') +RESOURCE_COUNT=$(osmo_curl GET "${OSMO_URL}/api/resources" | jq '[.resources[] | select(.allocatable_fields.gpu != null)] | length') echo "GPU nodes visible to OSMO: ${RESOURCE_COUNT}" if [[ "$RESOURCE_COUNT" -gt 0 ]]; then echo "" echo "GPU resources:" - curl -s "${OSMO_URL}/api/resources" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' + osmo_curl GET "${OSMO_URL}/api/resources" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' fi # ----------------------------------------------------------------------------- @@ -127,3 +164,4 @@ echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default" echo "" echo "Or test via curl:" echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" +echo "" diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index 60cb9505a..86baa93bc 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -289,6 +289,13 @@ start_osmo_port_forward() { local ns="${1:-osmo}" local local_port="${2:-8080}" + # Kill any stale port-forward on the target port (e.g. 
from a previous sourced run) + if command -v lsof &>/dev/null && lsof -ti :"$local_port" &>/dev/null; then + log_warning "Port ${local_port} already in use — killing stale process" + kill $(lsof -ti :"$local_port") 2>/dev/null || true + sleep 1 + fi + if has_envoy_sidecar "$ns" "app=osmo-service"; then log_info "Envoy sidecar detected — port-forwarding to pod:8000 (bypass Envoy)" local pod_name From 064feb3b65d21ed2c9f56cd731f2134a7a01ebb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 22:24:38 +0100 Subject: [PATCH 20/37] - alternative deployment path --- .../osmo/deploy/002-setup/04-enable-tls.sh | 1 + .../002-setup/05-deploy-osmo-control-plane.sh | 71 +- .../deploy/002-setup/07-configure-storage.sh | 7 +- .../002-setup/08-configure-service-url.sh | 6 +- .../01-deploy-gpu-infrastructure.sh | 137 + .../002a-setup/02-deploy-observability.sh | 103 + .../002a-setup/03-deploy-nginx-ingress.sh | 89 + .../04-deploy-osmo-control-plane.sh | 1871 ++++++++++++ .../osmo/deploy/002a-setup/04-enable-tls.sh | 439 +++ .../002a-setup/05-deploy-osmo-backend.sh | 410 +++ .../deploy/002a-setup/06-configure-storage.sh | 244 ++ .../002a-setup/07-configure-service-url.sh | 149 + .../002a-setup/08-configure-gpu-platform.sh | 128 + applications/osmo/deploy/002a-setup/README.md | 363 +++ .../cleanup/uninstall-gpu-infrastructure.sh | 43 + .../002a-setup/cleanup/uninstall-keycloak.sh | 62 + .../cleanup/uninstall-nginx-ingress.sh | 20 + .../cleanup/uninstall-observability.sh | 76 + .../cleanup/uninstall-osmo-backend.sh | 63 + .../cleanup/uninstall-osmo-control-plane.sh | 34 + .../osmo/deploy/002a-setup/defaults.sh | 72 + .../002a-setup/gpu_platform_update.json | 14 + .../deploy/002a-setup/gpu_pod_template.json | 16 + .../osmo/deploy/002a-setup/lib/common.sh | 434 +++ .../deploy/002a-setup/osmo-values-noauth.yaml | 170 ++ .../deploy/002a-setup/sample_osmo_realm.json | 2636 +++++++++++++++++ .../002a-setup/values/gpu-operator.yaml | 57 + 
.../deploy/002a-setup/values/grafana.yaml | 70 + .../002a-setup/values/kai-scheduler.yaml | 13 + .../osmo/deploy/002a-setup/values/loki.yaml | 68 + .../002a-setup/values/network-operator.yaml | 62 + .../values/osmo-backend-operator.yaml | 37 + .../deploy/002a-setup/values/prometheus.yaml | 109 + .../deploy/002a-setup/values/promtail.yaml | 46 + 34 files changed, 8112 insertions(+), 8 deletions(-) create mode 100755 applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh create mode 100755 applications/osmo/deploy/002a-setup/02-deploy-observability.sh create mode 100755 applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh create mode 100755 applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh create mode 100755 applications/osmo/deploy/002a-setup/04-enable-tls.sh create mode 100755 applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh create mode 100755 applications/osmo/deploy/002a-setup/06-configure-storage.sh create mode 100755 applications/osmo/deploy/002a-setup/07-configure-service-url.sh create mode 100755 applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh create mode 100755 applications/osmo/deploy/002a-setup/README.md create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh create mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh create mode 100755 applications/osmo/deploy/002a-setup/defaults.sh create mode 100755 applications/osmo/deploy/002a-setup/gpu_platform_update.json create mode 100755 applications/osmo/deploy/002a-setup/gpu_pod_template.json create mode 100755 
applications/osmo/deploy/002a-setup/lib/common.sh create mode 100755 applications/osmo/deploy/002a-setup/osmo-values-noauth.yaml create mode 100755 applications/osmo/deploy/002a-setup/sample_osmo_realm.json create mode 100755 applications/osmo/deploy/002a-setup/values/gpu-operator.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/grafana.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/loki.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/network-operator.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/prometheus.yaml create mode 100755 applications/osmo/deploy/002a-setup/values/promtail.yaml diff --git a/applications/osmo/deploy/002-setup/04-enable-tls.sh b/applications/osmo/deploy/002-setup/04-enable-tls.sh index 22c8138a5..800b7ffb3 100755 --- a/applications/osmo/deploy/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/002-setup/04-enable-tls.sh @@ -35,6 +35,7 @@ source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" +HOSTNAME="${HOSTNAME%.}" # Strip trailing dot (FQDN notation) TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" OSMO_NS="${OSMO_NAMESPACE:-osmo}" INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh index d0480abf2..65307ce90 100755 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh @@ -1162,10 +1162,12 @@ services: nginx.ingress.kubernetes.io/proxy-busy-buffers-size: "32k" nginx.ingress.kubernetes.io/large-client-header-buffers: "4 16k" # Authentication configuration - # NOTE: auth.enabled must be false — 
Envoy sidecars handle JWT/OAuth2 auth externally. - # Setting auth.enabled=true breaks OSMO's internal JWT signing (/api/auth/jwt/access_token) - # because the service_auth code path conflicts with external OIDC endpoints. - # The OIDC endpoints are still listed so the osmo CLI (osmo login) can discover them. + # NOTE: auth.enabled must be false even with Keycloak — when true, OSMO enforces + # auth on ALL API calls (403 Forbidden without x-osmo-user/JWT), which breaks the + # Web UI's server-side tRPC calls (Node.js → OSMO) that don't carry auth tokens. + # Instead, Envoy handles external auth, and the login_info endpoints are populated + # via a PATCH to /api/configs/service after deployment (see Step 12b below). + # /api/auth/keys still works with auth.enabled=false (RSA keys are auto-generated). $(if [[ "$AUTH_ENABLED" == "true" ]]; then cat <`. + if [[ "$AUTH_ENABLED" == "true" ]]; then + log_info "Populating /api/auth/login with Keycloak endpoints..." + KC_OIDC="${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect" + cat > /tmp/login_info_fix.json << LOGINEOF +{ + "service_auth": { + "login_info": { + "device_endpoint": "${KC_OIDC}/auth/device", + "device_client_id": "osmo-device", + "browser_endpoint": "${KC_OIDC}/auth", + "browser_client_id": "osmo-browser-flow", + "token_endpoint": "${KC_OIDC}/token", + "logout_endpoint": "${KC_OIDC}/logout" + } + } +} +LOGINEOF + if osmo_config_update SERVICE /tmp/login_info_fix.json "Populate auth/login endpoints"; then + log_success "/api/auth/login endpoints configured for Keycloak" + else + log_warning "Failed to populate /api/auth/login. CLI login may not auto-discover endpoints." + log_warning "Run: osmo login ${TARGET_SERVICE_URL} --method dev" + fi + rm -f /tmp/login_info_fix.json + fi else log_warning "Port-forward not ready. Run ./08-configure-service-url.sh manually." 
fi diff --git a/applications/osmo/deploy/002-setup/07-configure-storage.sh b/applications/osmo/deploy/002-setup/07-configure-storage.sh index 503c391eb..c677a5746 100755 --- a/applications/osmo/deploy/002-setup/07-configure-storage.sh +++ b/applications/osmo/deploy/002-setup/07-configure-storage.sh @@ -8,6 +8,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" echo "" echo "========================================" @@ -107,7 +108,11 @@ trap cleanup_port_forward EXIT log_info "Waiting for port-forward to be ready..." max_wait=30 elapsed=0 -while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do +while true; do + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") + if [[ "$HTTP_CODE" == "200" ]]; then + break + fi sleep 1 ((elapsed += 1)) if [[ $elapsed -ge $max_wait ]]; then diff --git a/applications/osmo/deploy/002-setup/08-configure-service-url.sh b/applications/osmo/deploy/002-setup/08-configure-service-url.sh index f240e4e52..3a0a24946 100755 --- a/applications/osmo/deploy/002-setup/08-configure-service-url.sh +++ b/applications/osmo/deploy/002-setup/08-configure-service-url.sh @@ -40,7 +40,11 @@ trap cleanup_port_forward EXIT log_info "Waiting for port-forward to be ready..." max_wait=30 elapsed=0 -while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do +while true; do + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") + if [[ "$HTTP_CODE" == "200" ]]; then + break + fi sleep 1 ((elapsed += 1)) if [[ $elapsed -ge $max_wait ]]; then diff --git a/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh new file mode 100755 index 000000000..ac6289b7d --- /dev/null +++ b/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# +# Deploy GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " GPU Infrastructure Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# Add Helm repos +log_info "Adding Helm repositories..." 
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update +helm repo update + +# Auto-detect driverfull images from Terraform config +if [[ -z "${USE_DRIVERFULL_IMAGES:-}" ]]; then + TF_DRIVERFULL=$(get_tf_output "gpu_nodes_driverfull_image" "../001-iac" || echo "") + if [[ "$TF_DRIVERFULL" == "true" ]]; then + USE_DRIVERFULL_IMAGES="true" + log_info "Auto-detected driverfull images from Terraform" + fi +fi + +# ----------------------------------------------------------------------------- +# Deploy GPU Operator (skipped when using driverfull images) +# ----------------------------------------------------------------------------- +if [[ "${USE_DRIVERFULL_IMAGES:-false}" == "true" ]]; then + log_info "Skipping GPU Operator (using Nebius driverfull images with pre-installed drivers)" + log_info "Installing NVIDIA device plugin for driverfull mode..." + + kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + # With driverfull images, we still need the GPU Operator for toolkit, device-plugin, + # dcgm, etc. - but driver installation is disabled. + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --namespace "${GPU_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/gpu-operator.yaml" \ + --set driver.enabled=false \ + --timeout 10m + + log_success "GPU Operator deployed (driver disabled - using driverfull images)" +else + log_info "Deploying NVIDIA GPU Operator (with driver installation)..." 
+ + kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --namespace "${GPU_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/gpu-operator.yaml" \ + --timeout 10m + + log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" +fi + +# Brief wait for core operator pod only (not GPU node components) +sleep 10 +kubectl get pods -n "${GPU_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true + +# ----------------------------------------------------------------------------- +# Deploy Network Operator (for InfiniBand) - OPTIONAL +# ----------------------------------------------------------------------------- +if [[ "${ENABLE_NETWORK_OPERATOR:-false}" == "true" ]]; then + log_info "Deploying NVIDIA Network Operator (InfiniBand support)..." + + kubectl create namespace "${NETWORK_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + + helm upgrade --install network-operator nvidia/network-operator \ + --namespace "${NETWORK_OPERATOR_NAMESPACE}" \ + --values "${VALUES_DIR}/network-operator.yaml" \ + --timeout 10m + + log_success "Network Operator deployed" + + # Brief wait and show status + sleep 5 + kubectl get pods -n "${NETWORK_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true +else + log_info "Skipping Network Operator (set ENABLE_NETWORK_OPERATOR=true to install)" +fi + +# ----------------------------------------------------------------------------- +# Deploy KAI Scheduler (from NVIDIA OCI registry) +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html +# ----------------------------------------------------------------------------- +log_info "Deploying KAI Scheduler..." 
+ +kubectl create namespace "${KAI_SCHEDULER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# Install directly from OCI registry +KAI_VERSION="${KAI_SCHEDULER_VERSION:-0.4.0}" +helm upgrade --install kai-scheduler \ + oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler \ + --version "${KAI_VERSION}" \ + --namespace "${KAI_SCHEDULER_NAMESPACE}" \ + --values "${VALUES_DIR}/kai-scheduler.yaml" \ + --timeout 5m + +log_success "KAI Scheduler deployed" + +# Brief wait and show status +sleep 5 +kubectl get pods -n "${KAI_SCHEDULER_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true + +# ----------------------------------------------------------------------------- +# Verify Installation +# ----------------------------------------------------------------------------- +echo "" +log_info "Verifying GPU infrastructure..." + +# Check GPU nodes +GPU_NODES=$(kubectl get nodes -l node-type=gpu -o name 2>/dev/null | wc -l) +if [[ $GPU_NODES -gt 0 ]]; then + log_success "Found $GPU_NODES GPU node(s)" + kubectl get nodes -l node-type=gpu -o wide +else + log_warning "No GPU nodes found yet (they may still be provisioning)" +fi + +echo "" +echo "========================================" +log_success "GPU Infrastructure deployment complete!" 
+echo "========================================" +echo "" +echo "Next step: ./02-deploy-observability.sh" +echo "" diff --git a/applications/osmo/deploy/002a-setup/02-deploy-observability.sh b/applications/osmo/deploy/002a-setup/02-deploy-observability.sh new file mode 100755 index 000000000..cee09bac5 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/02-deploy-observability.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Deploy Observability Stack (Prometheus, Grafana, Loki) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " Observability Stack Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# Add Helm repos +log_info "Adding Helm repositories..." +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update +helm repo add grafana https://grafana.github.io/helm-charts --force-update +helm repo update + +# Create namespace +kubectl create namespace "${MONITORING_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# Generate Grafana password if not set +if [[ -z "$GRAFANA_ADMIN_PASSWORD" ]]; then + GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 16) + log_info "Generated Grafana admin password" +fi + +# ----------------------------------------------------------------------------- +# Deploy Prometheus +# ----------------------------------------------------------------------------- +log_info "Deploying Prometheus..." 
+ +helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/prometheus.yaml" \ + --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD}" \ + --wait --timeout 10m + +log_success "Prometheus stack deployed" + +# ----------------------------------------------------------------------------- +# Deploy Loki +# ----------------------------------------------------------------------------- +log_info "Deploying Loki..." + +helm upgrade --install loki grafana/loki-stack \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/loki.yaml" \ + --wait --timeout 10m + +log_success "Loki deployed" + +# ----------------------------------------------------------------------------- +# Deploy Promtail +# ----------------------------------------------------------------------------- +log_info "Deploying Promtail..." + +helm upgrade --install promtail grafana/promtail \ + --namespace "${MONITORING_NAMESPACE}" \ + --values "${VALUES_DIR}/promtail.yaml" \ + --wait --timeout 5m + +log_success "Promtail deployed" + +# ----------------------------------------------------------------------------- +# Configure Grafana Datasources +# ----------------------------------------------------------------------------- +log_info "Configuring Grafana datasources..." + +# Loki datasource is auto-configured via values + +# Wait for Grafana +wait_for_pods "${MONITORING_NAMESPACE}" "app.kubernetes.io/name=grafana" 180 + +# ----------------------------------------------------------------------------- +# Output Access Information +# ----------------------------------------------------------------------------- +echo "" +echo "========================================" +log_success "Observability stack deployment complete!" 
+echo "========================================" +echo "" +echo "Access Grafana:" +echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-grafana 3000:80" +echo " URL: http://localhost:3000" +echo " Username: admin" +echo " Password: ${GRAFANA_ADMIN_PASSWORD}" +echo "" +echo "Access Prometheus:" +echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-kube-prometheus-prometheus 9090:9090" +echo " URL: http://localhost:9090" +echo "" +echo "Next step: ./03-deploy-nginx-ingress.sh" +echo "" diff --git a/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh new file mode 100755 index 000000000..5ecda68d3 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# +# Deploy NGINX Ingress Controller (community) +# Provides path-based routing for all OSMO services (API, router, Web UI). +# +# This installs the same controller OSMO uses elsewhere: +# - OSMO quick-start chart (Chart.yaml) depends on ingress-nginx from the same Helm repo. +# - OSMO Kind runner (run/start_service_kind.py) installs ingress-nginx the same way. +# We do not use the quick-start umbrella chart here (Nebius uses managed DB, etc.), +# so we install the controller explicitly. Not a duplicate of OSMO—same upstream chart. +# +# Run before 04-deploy-osmo-control-plane.sh.
+# See: https://kubernetes.github.io/ingress-nginx/deploy/ + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" + +echo "" +echo "========================================" +echo " NGINX Ingress Controller Deployment" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Add Helm repo +# ----------------------------------------------------------------------------- +log_info "Adding ingress-nginx Helm repository..." +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create namespace and install +# ----------------------------------------------------------------------------- +log_info "Creating namespace ${INGRESS_NAMESPACE}..." +kubectl create namespace "${INGRESS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +log_info "Installing NGINX Ingress Controller..." +# --set controller.progressDeadlineSeconds=600: chart v4.14+ defaults to 0 which +# K8s 1.32+ rejects ("must be greater than minReadySeconds"). Without this fix the +# Deployment is invalid, the controller never starts, and the admission webhook +# blocks all Ingress resource creation in downstream scripts. +helm upgrade --install "${INGRESS_RELEASE_NAME}" ingress-nginx/ingress-nginx \ + --namespace "${INGRESS_NAMESPACE}" \ + --set controller.service.type=LoadBalancer \ + --set controller.progressDeadlineSeconds=600 \ + --wait --timeout 5m || { + log_warning "Helm install returned non-zero; controller may still be starting." 
+} + +log_success "NGINX Ingress Controller deployed" + +# ----------------------------------------------------------------------------- +# Wait for LoadBalancer IP (optional; may take 1–2 min on cloud) +# ----------------------------------------------------------------------------- +log_info "Waiting for LoadBalancer IP (up to 120s)..." +for i in $(seq 1 24); do + LB_IP=$(kubectl get svc -n "${INGRESS_NAMESPACE}" -l app.kubernetes.io/name=ingress-nginx -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + log_success "LoadBalancer IP: ${LB_IP}" + echo "" + echo "OSMO will be accessible at:" + echo " http://${LB_IP}" + echo "" + echo "This URL is auto-detected by 04-deploy-osmo-control-plane.sh." + echo "" + break + fi + sleep 5 +done +if [[ -z "${LB_IP:-}" ]]; then + log_warning "LoadBalancer IP not yet assigned. Check: kubectl get svc -n ${INGRESS_NAMESPACE}" +fi + +echo "========================================" +log_success "NGINX Ingress deployment complete" +echo "========================================" +echo "" +echo "Next: run 03a-setup-tls-certificate.sh (optional, recommended)" +echo " then 04-deploy-osmo-control-plane.sh" +echo "" diff --git a/applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh new file mode 100755 index 000000000..a370f65c6 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh @@ -0,0 +1,1871 @@ +#!/bin/bash +# +# Deploy OSMO Service (Control Plane) +# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html +# +# Components: API Service, Router, Web UI, Worker, Logger, Agent, Keycloak +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Service Deployment" +echo 
"========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +# Deploy Keycloak in same namespace as PostgreSQL to simplify DNS resolution +KEYCLOAK_NAMESPACE="${OSMO_NAMESPACE}" +OSMO_DOMAIN="${OSMO_DOMAIN:-osmo.local}" + +# Keycloak admin password - check for existing secret first to maintain consistency +if [[ -z "${KEYCLOAK_ADMIN_PASSWORD:-}" ]]; then + # Try to get existing password from secret + EXISTING_KC_PASS=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null || true) + if [[ -n "${EXISTING_KC_PASS}" ]]; then + KEYCLOAK_ADMIN_PASSWORD="${EXISTING_KC_PASS}" + log_info "Using existing Keycloak admin password from secret" + else + KEYCLOAK_ADMIN_PASSWORD="$(openssl rand -base64 12)" + log_info "Generated new Keycloak admin password" + fi +fi + +# ----------------------------------------------------------------------------- +# Get Database Configuration from Terraform (Nebius Managed PostgreSQL) +# ----------------------------------------------------------------------------- +log_info "Using Nebius Managed PostgreSQL..." + log_info "Retrieving database configuration..." 
+ + # Get connection details from Terraform outputs + POSTGRES_HOST=$(get_tf_output "postgresql.host" "../001-iac" || echo "") + POSTGRES_PORT=$(get_tf_output "postgresql.port" "../001-iac" || echo "5432") + POSTGRES_DB=$(get_tf_output "postgresql.database" "../001-iac" || echo "osmo") + POSTGRES_USER=$(get_tf_output "postgresql.username" "../001-iac" || echo "osmo_admin") + + # Get password - try MysteryBox first, then Terraform output, then env vars + # MysteryBox secret ID is set by secrets-init.sh as TF_VAR_postgresql_mysterybox_secret_id + POSTGRES_SECRET_ID="${TF_VAR_postgresql_mysterybox_secret_id:-${OSMO_POSTGRESQL_SECRET_ID:-}}" + + if [[ -n "$POSTGRES_SECRET_ID" ]]; then + log_info "Reading PostgreSQL password from MysteryBox (secret: $POSTGRES_SECRET_ID)..." + POSTGRES_PASSWORD=$(get_mysterybox_secret "$POSTGRES_SECRET_ID" "password" || echo "") + if [[ -n "$POSTGRES_PASSWORD" ]]; then + log_success "PostgreSQL password retrieved from MysteryBox" + else + log_warning "Failed to read password from MysteryBox" + fi + fi + + # Fall back to Terraform output (only works if not using MysteryBox) + if [[ -z "$POSTGRES_PASSWORD" ]]; then + POSTGRES_PASSWORD=$(get_tf_output "postgresql_password" "../001-iac" || echo "") + fi + + # Fall back to environment variables or prompt + if [[ -z "$POSTGRES_HOST" || -z "$POSTGRES_PASSWORD" ]]; then + log_warning "Could not retrieve PostgreSQL configuration automatically" + log_info "Checking environment variables..." 
+ + POSTGRES_HOST=${POSTGRES_HOST:-${OSMO_POSTGRES_HOST:-""}} + POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-${OSMO_POSTGRES_PASSWORD:-""}} + + if [[ -z "$POSTGRES_HOST" ]]; then + read_prompt_var "PostgreSQL Host" POSTGRES_HOST "" + fi + if [[ -z "$POSTGRES_PASSWORD" ]]; then + read_secret_var "PostgreSQL Password" POSTGRES_PASSWORD + fi + fi + +log_success "Database: ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" + +# ----------------------------------------------------------------------------- +# Get Storage Configuration +# ----------------------------------------------------------------------------- +log_info "Retrieving storage configuration..." + +S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" || echo "") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "https://storage.eu-north1.nebius.cloud") +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" || echo "") + +# Secret access key is stored in MysteryBox (ephemeral, not in Terraform state) +S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" || echo "") +S3_SECRET_KEY="" + +if [[ -n "$S3_SECRET_REF_ID" ]]; then + log_info "Retrieving storage secret from MysteryBox..." + # IAM access key secrets are stored with key "secret" in MysteryBox + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" || echo "") + if [[ -n "$S3_SECRET_KEY" ]]; then + log_success "Storage secret retrieved from MysteryBox" + else + log_warning "Could not retrieve storage secret from MysteryBox" + fi +fi + +if [[ -n "$S3_BUCKET" ]]; then + log_success "Storage: ${S3_BUCKET} @ ${S3_ENDPOINT}" +fi + +# ----------------------------------------------------------------------------- +# Add Helm Repositories +# ----------------------------------------------------------------------------- +log_info "Adding Helm repositories..." 
+helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update +helm repo add bitnami https://charts.bitnami.com/bitnami --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Step 1: Create Namespaces +# ----------------------------------------------------------------------------- +log_info "Creating namespace..." +kubectl create namespace "${OSMO_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - +# Note: Keycloak is deployed in the same namespace as OSMO (no separate namespace needed) + +# ----------------------------------------------------------------------------- +# Step 2: Configure PostgreSQL - Verify Connection and Create Databases +# ----------------------------------------------------------------------------- +log_info "Verifying PostgreSQL connection..." + + # Delete any existing test/init pods + kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null + + # Create a temporary secret with DB credentials + # NOTE: PGDATABASE must be the bootstrap database ('osmo') for Nebius MSP PostgreSQL + kubectl create secret generic osmo-db-init-creds \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=PGPASSWORD="${POSTGRES_PASSWORD}" \ + --from-literal=PGHOST="${POSTGRES_HOST}" \ + --from-literal=PGPORT="${POSTGRES_PORT}" \ + --from-literal=PGUSER="${POSTGRES_USER}" \ + --from-literal=PGDATABASE="${POSTGRES_DB}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # ----------------------------------------------------------------------------- + # Connection Test - Verify credentials before proceeding + # ----------------------------------------------------------------------------- + log_info "Testing PostgreSQL connection (this may take a moment)..." 
+ + kubectl run osmo-db-test \ + --namespace "${OSMO_NAMESPACE}" \ + --image=postgres:16-alpine \ + --restart=Never \ + --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ + --env="PGHOST=${POSTGRES_HOST}" \ + --env="PGPORT=${POSTGRES_PORT}" \ + --env="PGUSER=${POSTGRES_USER}" \ + --env="PGDATABASE=${POSTGRES_DB}" \ + --command -- sh -c 'psql -c "SELECT 1" >/dev/null 2>&1 && echo "CONNECTION_OK" || echo "CONNECTION_FAILED"' \ + >/dev/null 2>&1 + + # Wait for test pod to complete + test_elapsed=0 + test_status="" + while [[ $test_elapsed -lt 60 ]]; do + test_status=$(kubectl get pod osmo-db-test -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + if [[ "$test_status" == "Succeeded" || "$test_status" == "Failed" ]]; then + break + fi + sleep 2 + ((test_elapsed += 2)) + done + + # Check test result + test_result=$(kubectl logs osmo-db-test -n "${OSMO_NAMESPACE}" 2>/dev/null || echo "") + kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found >/dev/null 2>&1 + + if [[ "$test_result" != *"CONNECTION_OK"* ]]; then + log_error "PostgreSQL connection test failed!" + echo "" + echo "Connection details:" + echo " Host: ${POSTGRES_HOST}" + echo " Port: ${POSTGRES_PORT}" + echo " Database: ${POSTGRES_DB}" + echo " User: ${POSTGRES_USER}" + echo " Password: (from MysteryBox secret ${TF_VAR_postgresql_mysterybox_secret_id:-'not set'})" + echo "" + echo "Possible causes:" + echo " 1. Password mismatch - MysteryBox password doesn't match PostgreSQL" + echo " Fix: Update MysteryBox or recreate PostgreSQL cluster" + echo " 2. Network issue - Cluster cannot reach PostgreSQL" + echo " 3. 
PostgreSQL not ready - Wait and retry" + echo "" + echo "To debug manually:" + echo " kubectl run psql-debug --rm -it --image=postgres:16-alpine -n osmo -- sh" + echo " PGPASSWORD='' psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB}" + exit 1 + fi + + log_success "PostgreSQL connection verified" + + # ----------------------------------------------------------------------------- + # Database Creation + # ----------------------------------------------------------------------------- + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + log_info "Creating OSMO and Keycloak databases (if not exist)..." + else + log_info "Verifying OSMO database..." + fi + + # NOTE: Nebius MSP PostgreSQL creates the bootstrap database ('osmo') automatically. + # The bootstrap user can only connect to this database, not 'postgres'. + # We connect to 'osmo' and create additional databases from there. + # Pass DEPLOY_KEYCLOAK to the init pod + kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null 2>&1; then + echo "ERROR: Cannot connect to PostgreSQL" + echo "Debug: PGHOST=\$PGHOST, PGPORT=\$PGPORT, PGUSER=\$PGUSER, PGDATABASE=\${PGDATABASE:-osmo}" + # Try with verbose error + psql -d "\${PGDATABASE:-osmo}" -c "SELECT 1" 2>&1 || true + exit 1 + fi + echo "Connection successful to database '\${PGDATABASE:-osmo}'" + + # The 'osmo' database already exists (created by Nebius bootstrap) + echo "Database 'osmo' exists (created by Nebius MSP bootstrap)" + + # Create keycloak database only if Keycloak deployment is enabled + DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" + if [ "\$DEPLOY_KEYCLOAK" = "true" ]; then + # Note: This requires the user to have CREATEDB privilege + if psql -d "\${PGDATABASE:-osmo}" -tAc "SELECT 1 FROM pg_database WHERE datname='keycloak'" | grep -q 1; then + echo "Database 'keycloak' already exists" + else + echo "Creating database 'keycloak'..." 
+ psql -d "\${PGDATABASE:-osmo}" -c "CREATE DATABASE keycloak;" || { + echo "WARNING: Could not create 'keycloak' database." + echo "The bootstrap user may not have CREATEDB privilege." + echo "Keycloak will use a schema in the 'osmo' database instead." + } + fi + fi + + # Verify databases exist + echo "" + echo "Verifying databases..." + psql -d "\${PGDATABASE:-osmo}" -c "\l" | grep -E "osmo" || true + + echo "" + echo "SUCCESS: Database initialization complete" + restartPolicy: Never +EOF + + # Wait for pod to complete (init pods may finish before Ready condition is detected) + log_info "Running database initialization..." + + # Poll for completion - init pods go directly to Completed/Succeeded very quickly + max_wait=120 + elapsed=0 + pod_status="" + + while [[ $elapsed -lt $max_wait ]]; do + pod_status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") + + if [[ "$pod_status" == "Succeeded" ]]; then + break + elif [[ "$pod_status" == "Failed" ]]; then + log_error "Database initialization failed. Checking logs..." + kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + exit 1 + fi + + sleep 2 + ((elapsed += 2)) + done + + if [[ "$pod_status" != "Succeeded" ]]; then + log_error "Database initialization timed out (status: $pod_status). Checking logs..." 
+ kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" 2>/dev/null || true + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + exit 1 + fi + + # Show logs for verification + log_info "Database initialization output:" + kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" + + # Cleanup + kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found + +log_success "Databases verified and ready" + +# ----------------------------------------------------------------------------- +# Step 3: Create Secrets +# ----------------------------------------------------------------------------- +log_info "Creating secrets..." + +# keycloak-db-secret is created later in Step 4 when DEPLOY_KEYCLOAK=true (with other Keycloak secrets) + +# Create the postgres-secret that OSMO chart expects +# The chart looks for passwordSecretName: postgres-secret, passwordSecretKey: password +kubectl create secret generic postgres-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + +# OIDC secrets (only needed if Keycloak is deployed) +# These are placeholder values that get overwritten with real Keycloak client secrets +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + HMAC_SECRET=$(openssl rand -base64 32) + CLIENT_SECRET=$(openssl rand -base64 32) + kubectl create secret generic oidc-secrets \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=client_secret="${CLIENT_SECRET}" \ + --from-literal=hmac_secret="${HMAC_SECRET}" \ + --dry-run=client -o yaml | kubectl apply -f - +fi + +# Storage secret (if available) +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + kubectl create secret generic osmo-storage \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=access-key-id="${S3_ACCESS_KEY}" \ + --from-literal=secret-access-key="${S3_SECRET_KEY}" \ + --dry-run=client -o yaml | kubectl apply -f - +fi + +# MEK (Master Encryption Key) Configuration +# OSMO expects 
MEK in JWK (JSON Web Key) format, base64-encoded +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html +MEK_ID="${MEK_ID:-key1}" +log_info "Configuring MEK (Master Encryption Key)..." + +# Try to read MEK from MysteryBox first (set by secrets-init.sh) +# MysteryBox secret ID is set as TF_VAR_mek_mysterybox_secret_id +MEK_SECRET_ID="${TF_VAR_mek_mysterybox_secret_id:-${OSMO_MEK_SECRET_ID:-}}" +MEK_DATA="" + +if [[ -n "$MEK_SECRET_ID" ]]; then + log_info "Reading MEK from MysteryBox (secret: $MEK_SECRET_ID)..." + MEK_DATA=$(get_mysterybox_secret "$MEK_SECRET_ID" "mek" || echo "") + if [[ -n "$MEK_DATA" ]]; then + log_success "MEK retrieved from MysteryBox" + # MEK from secrets-init.sh is in format: {"currentMek":"key1","meks":{"key1":""}} + # Extract the key ID and encoded value + MEK_ID=$(echo "$MEK_DATA" | jq -r '.currentMek // "key1"' 2>/dev/null || echo "key1") + MEK_ENCODED=$(echo "$MEK_DATA" | jq -r ".meks.${MEK_ID} // empty" 2>/dev/null || echo "") + + if [[ -z "$MEK_ENCODED" ]]; then + log_warning "Could not parse MEK from MysteryBox, will generate new one" + MEK_DATA="" + fi + else + log_warning "Failed to read MEK from MysteryBox" + fi +fi + +# Generate new MEK if not retrieved from MysteryBox +if [[ -z "$MEK_DATA" || -z "$MEK_ENCODED" ]]; then + log_info "Generating new MEK in JWK format..." 
+ MEK_KEY_RAW="$(openssl rand -base64 32 | tr -d '\n')" + MEK_JWK="{\"k\":\"${MEK_KEY_RAW}\",\"kid\":\"${MEK_ID}\",\"kty\":\"oct\"}" + MEK_ENCODED="$(echo -n "$MEK_JWK" | base64 | tr -d '\n')" + log_success "New MEK generated" +fi + +# Create MEK ConfigMap (OSMO expects ConfigMap, not Secret) +kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null; then + log_info "Redis already deployed" +else + helm upgrade --install redis bitnami/redis \ + --namespace "${OSMO_NAMESPACE}" \ + --set architecture=standalone \ + --set auth.enabled=false \ + --set master.persistence.size=1Gi \ + --set master.resources.requests.cpu=100m \ + --set master.resources.requests.memory=128Mi \ + --wait --timeout 5m + + log_success "Redis deployed" +fi + +REDIS_HOST="redis-master.${OSMO_NAMESPACE}.svc.cluster.local" + +# ----------------------------------------------------------------------------- +# Step 4: Deploy Keycloak (Enable with DEPLOY_KEYCLOAK=true) +# ----------------------------------------------------------------------------- +# Keycloak provides authentication for OSMO +# Required for: osmo login, osmo token, backend operator +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak + +# Keycloak service URL (same namespace as OSMO) +KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" +KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" + +# Derive Keycloak external hostname +# Priority: KEYCLOAK_HOSTNAME env var > auto-derive from OSMO_INGRESS_HOSTNAME +if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + AUTH_DOMAIN="${KEYCLOAK_HOSTNAME}" +elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + AUTH_DOMAIN="auth-${OSMO_INGRESS_HOSTNAME}" +else + AUTH_DOMAIN="auth-${OSMO_DOMAIN}" +fi +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" + +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + log_info "Deploying Keycloak for OSMO authentication..." 
+ log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + + # Keycloak database was already created in Step 2 (osmo-db-init pod) when DEPLOY_KEYCLOAK=true + + # ------------------------------------------------------------------------- + # Step 1: Create secrets for Keycloak + # ------------------------------------------------------------------------- + log_info "Creating Keycloak secrets..." + + # Save admin password to secret for future re-runs + kubectl create secret generic keycloak-admin-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=password="${KEYCLOAK_ADMIN_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create keycloak-db-secret for external database (per OSMO docs) + # Uses the managed PostgreSQL credentials + kubectl create secret generic keycloak-db-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "Keycloak secrets created" + + # ------------------------------------------------------------------------- + # Step 2: Install Keycloak using Bitnami Helm chart + # Per OSMO docs: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#install-keycloak-using-bitnami-helm-chart + # ------------------------------------------------------------------------- + log_info "Installing Keycloak using Bitnami Helm chart..." 
+ + # Add Bitnami repo + helm repo add bitnami https://charts.bitnami.com/bitnami --force-update 2>/dev/null || true + helm repo update bitnami + + # Determine if Keycloak should use external TLS ingress + KC_EXTERNAL="false" + if [[ "${OSMO_TLS_ENABLED:-false}" == "true" && -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + # Check TLS secret for auth domain exists + if kubectl get secret "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null || \ + kubectl get secret "${KC_TLS_SECRET}" -n "${INGRESS_NAMESPACE:-ingress-nginx}" &>/dev/null; then + KC_EXTERNAL="true" + log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" + else + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." + log_warning "Run: OSMO_INGRESS_HOSTNAME=${AUTH_DOMAIN} OSMO_TLS_SECRET_NAME=${KC_TLS_SECRET} ./03a-setup-tls-certificate.sh" + log_warning "Keycloak will be internal-only (port-forward access)" + fi + fi + + # Create keycloak-values.yaml per OSMO documentation + cat > /tmp/keycloak-values.yaml </dev/null || true + + # Install or upgrade Keycloak + # Note: Don't use --wait as it can hang; we'll check status separately + helm upgrade --install keycloak bitnami/keycloak \ + --namespace "${OSMO_NAMESPACE}" \ + -f /tmp/keycloak-values.yaml \ + --timeout 10m || { + log_warning "Helm install returned non-zero, checking pod status..." + } + + rm -f /tmp/keycloak-values.yaml + log_success "Keycloak Helm release installed" + + # Wait for Keycloak to be ready + log_info "Waiting for Keycloak to be ready (this may take 3-5 minutes)..." + + # Wait for the pod to exist first + for i in {1..30}; do + if kubectl get pods -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak 2>/dev/null | grep -q keycloak; then + break + fi + echo " Waiting for Keycloak pod to be created... 
($i/30)" + sleep 5 + done + + # Now wait for it to be ready + kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ + -n "${OSMO_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak pod not ready yet, checking logs..." + kubectl logs -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --tail=30 || true + } + + # Additional wait for Keycloak to fully initialize + log_info "Waiting for Keycloak to fully initialize..." + sleep 30 + + # Configure Keycloak realm using the official OSMO realm JSON + # Source: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#post-installation-keycloak-configuration + # The official sample_osmo_realm.json includes everything needed for OSMO RBAC: + # - Roles: osmo-user, osmo-admin, osmo-backend, grafana-*, dashboard-* + # - Groups: Admin, User, Backend Operator (with proper client-role mappings) + # - Clients: osmo-device (public, device code flow), osmo-browser-flow (confidential, auth code) + # - Mappers: "Create roles claim" protocol mapper on both clients (JWT roles claim) + # - Scopes: Standard OIDC scopes (profile, email, roles, etc.) + log_info "Configuring Keycloak realm using official OSMO realm JSON..." + + # Generate client secret for osmo-browser-flow (confidential client) + OIDC_CLIENT_SECRET=$(openssl rand -hex 16) + + # Determine OSMO base URL for client redirect URIs + if [[ "$KC_EXTERNAL" == "true" ]]; then + OSMO_BASE_URL="https://${OSMO_INGRESS_HOSTNAME}" + else + OSMO_BASE_URL="http://localhost:8080" + fi + + # Upload the official realm JSON as a ConfigMap (so the job can mount it) + log_info "Creating ConfigMap from sample_osmo_realm.json..." 
+ kubectl create configmap keycloak-realm-json \ + --namespace "${OSMO_NAMESPACE}" \ + --from-file=realm.json="${SCRIPT_DIR}/sample_osmo_realm.json" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Create a job to import the realm and configure a test user + cat > /tmp/keycloak-config-job.yaml < /dev/null 2>&1; then + echo "Keycloak is ready" + break + fi + echo " Attempt \$i: Keycloak not ready yet..." + sleep 15 + done + echo "" + + # ── Step 3: Get admin token ───────────────────────────── + echo "=== Step 3: Get admin token ===" + for i in 1 2 3 4 5; do + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + if [ -n "\$TOKEN" ]; then break; fi + echo " Retry \$i: waiting for token..." + sleep 10 + done + + if [ -z "\$TOKEN" ]; then + echo "FATAL: Failed to get admin token" + exit 1 + fi + echo "Got admin token" + echo "" + + # ── Step 4: Import OSMO realm ─────────────────────────── + echo "=== Step 4: Import OSMO realm ===" + + # Delete existing realm if present (idempotent re-runs) + REALM_STATUS=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$REALM_STATUS" = "200" ]; then + echo " Existing 'osmo' realm found – deleting for fresh import..." + curl -s -X DELETE "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN" + echo " Old realm deleted" + sleep 5 + fi + + echo "Importing official OSMO realm from sample_osmo_realm.json..." 
+ IMPORT_HTTP=\$(curl -s -o /tmp/import-resp.txt -w "%{http_code}" \ + -X POST "\${KEYCLOAK_URL}/admin/realms" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/realm-import.json) + + if [ "\$IMPORT_HTTP" = "201" ] || [ "\$IMPORT_HTTP" = "204" ]; then + echo "Realm imported successfully (HTTP \$IMPORT_HTTP)" + else + echo "WARNING: Realm import returned HTTP \$IMPORT_HTTP" + cat /tmp/import-resp.txt 2>/dev/null || true + echo "" + # Attempt partial import as fallback + echo "Trying partial import as fallback..." + curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/partialImport" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/realm-import.json || echo "Partial import also failed" + fi + + # Verify realm exists + sleep 3 + VERIFY=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ + -H "Authorization: Bearer \$TOKEN") + if [ "\$VERIFY" != "200" ]; then + echo "FATAL: Realm 'osmo' not found after import (HTTP \$VERIFY)" + exit 1 + fi + echo "Realm 'osmo' verified" + echo "" + + # ── Step 5: Create test user ──────────────────────────── + echo "=== Step 5: Create test user ===" + + # Refresh admin token (import may have taken a while) + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + echo "Creating osmo-admin test user..." 
+ curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/users" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "username": "osmo-admin", + "enabled": true, + "emailVerified": true, + "firstName": "OSMO", + "lastName": "Admin", + "email": "osmo-admin@example.com", + "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] + }' || echo "User may already exist" + echo "" + + # ── Step 6: Assign user to Admin group ────────────────── + echo "=== Step 6: Assign user to Admin group ===" + + # Get user internal ID + USER_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/users?username=osmo-admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$USER_ID" ]; then + echo " User ID: \$USER_ID" + + # Get Admin group internal ID + ADMIN_GROUP_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/groups?search=Admin" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$ADMIN_GROUP_ID" ]; then + echo " Admin Group ID: \$ADMIN_GROUP_ID" + curl -s -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/users/\${USER_ID}/groups/\${ADMIN_GROUP_ID}" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d '{}' || echo "Failed to assign group" + echo " User 'osmo-admin' assigned to Admin group (osmo-admin + osmo-user roles)" + else + echo " WARNING: Admin group not found – user roles may need manual assignment" + fi + else + echo " WARNING: Could not find osmo-admin user ID" + fi + echo "" + + # ── Done ──────────────────────────────────────────────── + echo "=========================================" + echo " Keycloak OSMO Configuration Complete" + echo "=========================================" + echo "" + echo "Realm: osmo (imported from official sample_osmo_realm.json)" + echo "Clients: osmo-device (public, device code + direct access)" + echo " osmo-browser-flow (confidential, authorization code)" + 
echo "Groups: Admin, User, Backend Operator" + echo "Roles: osmo-admin, osmo-user, osmo-backend, grafana-*, dashboard-*" + echo "Mappers: JWT 'roles' claim configured on both clients" + echo "Test user: osmo-admin / osmo-admin (Admin group)" + echo "" +EOF + + # Delete any previous config job + kubectl delete job keycloak-osmo-setup -n "${KEYCLOAK_NAMESPACE}" --ignore-not-found 2>/dev/null || true + + kubectl apply -f /tmp/keycloak-config-job.yaml + + log_info "Waiting for Keycloak realm import job..." + kubectl wait --for=condition=complete job/keycloak-osmo-setup \ + -n "${KEYCLOAK_NAMESPACE}" --timeout=300s || { + log_warning "Keycloak configuration may have failed, check logs:" + kubectl logs -n "${KEYCLOAK_NAMESPACE}" -l job-name=keycloak-osmo-setup --tail=50 || true + } + + # Store the client secret for OIDC (used by Envoy sidecar) + kubectl create secret generic oidc-secrets \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=client_secret="${OIDC_CLIENT_SECRET}" \ + --from-literal=hmac_secret="$(openssl rand -base64 32)" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Clean up temporary files and ConfigMap + rm -f /tmp/keycloak-values.yaml /tmp/keycloak-config-job.yaml + kubectl delete configmap keycloak-realm-json -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + + log_success "Keycloak deployed and configured" + echo "" + if [[ "$KC_EXTERNAL" == "true" ]]; then + echo "Keycloak Access (external):" + echo " URL: https://${AUTH_DOMAIN}" + echo " Admin console: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints:" + echo " Token: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/token" + echo " Auth: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Enable OSMO auth with Envoy sidecars (production mode) + AUTH_ENABLED="true" + KEYCLOAK_EXTERNAL_URL="https://${AUTH_DOMAIN}" + 
log_success "OSMO authentication will be ENABLED with Envoy sidecars" + else + echo "Keycloak Access (port-forward only):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + echo "OSMO Auth Endpoints (in-cluster):" + echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" + echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" + echo "" + # Auth disabled when Keycloak is internal-only (no Envoy, open API) + AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" + log_info "Note: OSMO auth disabled (Keycloak is internal-only, no TLS ingress)" + log_info "To enable auth, set up TLS for the auth subdomain and re-run." + fi +else + log_info "Skipping Keycloak (set DEPLOY_KEYCLOAK=true to enable)" + log_warning "Without Keycloak, 'osmo login' and token creation will not work" + log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" + AUTH_ENABLED="false" + KEYCLOAK_EXTERNAL_URL="" +fi + +# ----------------------------------------------------------------------------- +# Step 5: Create OSMO Values File +# ----------------------------------------------------------------------------- +log_info "Creating OSMO values file..." + +# NGINX Ingress – run 03-deploy-nginx-ingress.sh before this script +# When OSMO_INGRESS_HOSTNAME is empty (default), ingress matches any Host header, +# allowing direct IP-based access. Set it to a real domain for host-based routing. 
+INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" +TLS_ENABLED="${OSMO_TLS_ENABLED:-false}" +TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +TLS_MODE="${OSMO_TLS_MODE:-}" + +if [[ -n "$INGRESS_HOSTNAME" ]]; then + log_info "Ingress hostname: ${INGRESS_HOSTNAME}" +else + log_info "Ingress hostname: (any — IP-based access)" +fi + +# TLS validation +if [[ "$TLS_ENABLED" == "true" ]]; then + log_info "TLS is ENABLED" + if [[ -z "$INGRESS_HOSTNAME" ]]; then + log_error "TLS is enabled but OSMO_INGRESS_HOSTNAME is not set." + echo " TLS certificates are issued for a domain name, not a bare IP." + echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com" + exit 1 + fi + # Check that the TLS secret exists (created by 03a or 03c) + OSMO_NS_CHECK="${OSMO_NAMESPACE:-osmo}" + INGRESS_NS_CHECK="${INGRESS_NAMESPACE:-ingress-nginx}" + TLS_SECRET_FOUND="false" + if kubectl get secret "${TLS_SECRET_NAME}" -n "${OSMO_NS_CHECK}" &>/dev/null || \ + kubectl get secret "${TLS_SECRET_NAME}" -n "${INGRESS_NS_CHECK}" &>/dev/null; then + TLS_SECRET_FOUND="true" + fi + if [[ "$TLS_SECRET_FOUND" != "true" ]]; then + log_error "TLS secret '${TLS_SECRET_NAME}' not found." + echo " Run one of these scripts first to obtain a certificate:" + echo " ./03a-setup-tls-certificate.sh (manual certbot with DNS-01)" + echo " ./03c-deploy-cert-manager.sh (automated cert-manager with HTTP-01)" + exit 1 + fi + log_success "TLS secret '${TLS_SECRET_NAME}' found" +else + log_info "TLS is disabled (HTTP only). Set OSMO_TLS_ENABLED=true to enable." 
+fi + +# Create the values file with proper extraEnv and extraVolumes for each service +# This configures PostgreSQL password via env var and MEK via volume mount +cat > /tmp/osmo_values.yaml < Keycloak) + oauth2Filter: + enabled: true + tokenEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token + authEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth + clientId: osmo-browser-flow + authProvider: ${AUTH_DOMAIN} + secretName: oidc-secrets + clientSecretKey: client_secret + hmacSecretKey: hmac_secret + + # JWT Filter config -- three providers + jwt: + user_header: x-osmo-user + providers: + # Provider 1: Keycloak device flow (CLI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-device + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 2: Keycloak browser flow (Web UI) + - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo + audience: osmo-browser-flow + jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs + user_claim: preferred_username + cluster: oauth + # Provider 3: OSMO-signed JWTs (service accounts) + - issuer: osmo + audience: osmo + jwks_uri: http://localhost:8000/api/auth/keys + user_claim: unique_name + cluster: service +ENVOY_ENABLED +else +cat </dev/null || true + +log_info "Deploying OSMO Service..." + +SERVICE_HELM_ARGS=( + --namespace "${OSMO_NAMESPACE}" + -f /tmp/osmo_values.yaml +) +[[ -n "$INGRESS_HOSTNAME" ]] && SERVICE_HELM_ARGS+=(--set "services.service.hostname=${INGRESS_HOSTNAME}") + +helm upgrade --install osmo-service osmo/service \ + "${SERVICE_HELM_ARGS[@]}" \ + --wait --timeout 10m || { + log_warning "OSMO Service deployment had issues" + log_info "Checking pod status..." 
+ kubectl get pods -n "${OSMO_NAMESPACE}" --no-headers | head -10 +} + +log_success "OSMO Service deployed" + +log_success "OSMO Service Helm deployment complete" + +# ----------------------------------------------------------------------------- +# Step 7: Deploy Router +# ----------------------------------------------------------------------------- +log_info "Deploying OSMO Router..." + +# Router requires configFile.enabled=true to mount the mek-config ConfigMap +# It also needs db-secret (not postgres-secret) for the password +kubectl create secret generic db-secret \ + --namespace "${OSMO_NAMESPACE}" \ + --from-literal=db-password="${POSTGRES_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - + +ROUTER_HELM_ARGS=( + --namespace "${OSMO_NAMESPACE}" + --set service.type=ClusterIP + --set services.configFile.enabled=true + --set "services.postgres.serviceName=${POSTGRES_HOST}" + --set "services.postgres.port=${POSTGRES_PORT}" + --set services.postgres.db=osmo + --set "services.postgres.user=${POSTGRES_USER}" + --set services.service.ingress.enabled=true + --set services.service.ingress.ingressClass=nginx + --set "services.service.ingress.sslEnabled=${TLS_ENABLED}" + --set services.service.scaling.minReplicas=1 + --set services.service.scaling.maxReplicas=1 + --set sidecars.logAgent.enabled=false +) +[[ -n "$INGRESS_HOSTNAME" ]] && ROUTER_HELM_ARGS+=(--set "services.service.hostname=${INGRESS_HOSTNAME}" --set "global.domain=${INGRESS_HOSTNAME}") + +# Envoy sidecar config for Router +if [[ "$AUTH_ENABLED" == "true" ]]; then + log_info "Enabling Envoy sidecar on Router with Keycloak auth..." 
+ ROUTER_HELM_ARGS+=( + --set sidecars.envoy.enabled=true + --set sidecars.envoy.useKubernetesSecrets=true + --set "sidecars.envoy.skipAuthPaths[0]=/api/router/version" + --set "sidecars.envoy.service.hostname=${INGRESS_HOSTNAME}" + # OAuth2 filter + --set sidecars.envoy.oauth2Filter.enabled=true + --set sidecars.envoy.oauth2Filter.forwardBearerToken=true + --set "sidecars.envoy.oauth2Filter.tokenEndpoint=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token" + --set "sidecars.envoy.oauth2Filter.authEndpoint=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth" + --set sidecars.envoy.oauth2Filter.clientId=osmo-browser-flow + --set "sidecars.envoy.oauth2Filter.authProvider=${AUTH_DOMAIN}" + --set sidecars.envoy.oauth2Filter.redirectPath=api/auth/getAToken + --set sidecars.envoy.oauth2Filter.logoutPath=logout + --set sidecars.envoy.oauth2Filter.secretName=oidc-secrets + --set sidecars.envoy.oauth2Filter.clientSecretKey=client_secret + --set sidecars.envoy.oauth2Filter.hmacSecretKey=hmac_secret + # JWT filter + --set sidecars.envoy.jwt.enabled=true + --set sidecars.envoy.jwt.user_header=x-osmo-user + # JWT Provider 1: Keycloak device flow (CLI) + --set "sidecars.envoy.jwt.providers[0].issuer=${KEYCLOAK_EXTERNAL_URL}/realms/osmo" + --set "sidecars.envoy.jwt.providers[0].audience=osmo-device" + --set "sidecars.envoy.jwt.providers[0].jwks_uri=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs" + --set "sidecars.envoy.jwt.providers[0].user_claim=preferred_username" + --set "sidecars.envoy.jwt.providers[0].cluster=oauth" + # JWT Provider 2: Keycloak browser flow (Web UI) + --set "sidecars.envoy.jwt.providers[1].issuer=${KEYCLOAK_EXTERNAL_URL}/realms/osmo" + --set "sidecars.envoy.jwt.providers[1].audience=osmo-browser-flow" + --set "sidecars.envoy.jwt.providers[1].jwks_uri=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs" + --set "sidecars.envoy.jwt.providers[1].user_claim=preferred_username" + --set 
"sidecars.envoy.jwt.providers[1].cluster=oauth" + # JWT Provider 3: OSMO-signed JWTs (service accounts) + --set "sidecars.envoy.jwt.providers[2].issuer=osmo" + --set "sidecars.envoy.jwt.providers[2].audience=osmo" + --set "sidecars.envoy.jwt.providers[2].jwks_uri=http://osmo-service/api/auth/keys" + --set "sidecars.envoy.jwt.providers[2].user_claim=unique_name" + --set "sidecars.envoy.jwt.providers[2].cluster=osmoauth" + # osmoauth cluster (Router-specific: points to osmo-service) + --set sidecars.envoy.osmoauth.enabled=true + --set sidecars.envoy.osmoauth.port=80 + --set "sidecars.envoy.osmoauth.hostname=${INGRESS_HOSTNAME}" + --set sidecars.envoy.osmoauth.address=osmo-service + ) +else + ROUTER_HELM_ARGS+=(--set sidecars.envoy.enabled=false) +fi + +# Proxy buffer annotations for Router ingress (required for OAuth2 -- JWT cookies make headers large) +if [[ "$AUTH_ENABLED" == "true" ]]; then + ROUTER_HELM_ARGS+=( + --set-string "services.service.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-buffer-size=16k" + --set-string "services.service.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-buffers-number=4" + ) +fi + +# TLS settings for Router ingress +if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then + ROUTER_HELM_ARGS+=( + --set-string "services.service.ingress.annotations.nginx\.ingress\.kubernetes\.io/ssl-redirect=true" + --set "services.service.ingress.tls[0].hosts[0]=${INGRESS_HOSTNAME}" + --set "services.service.ingress.tls[0].secretName=${TLS_SECRET_NAME}" + ) + if [[ "$TLS_MODE" == "cert-manager" ]]; then + ROUTER_HELM_ARGS+=( + --set-string "services.service.ingress.annotations.cert-manager\.io/cluster-issuer=${CLUSTER_ISSUER_NAME:-letsencrypt-prod}" + ) + fi +fi + +helm upgrade --install osmo-router osmo/router \ + "${ROUTER_HELM_ARGS[@]}" \ + --wait --timeout 5m || log_warning "Router deployment had issues" + +log_success "OSMO Router deployed" + +# 
----------------------------------------------------------------------------- +# Step 8: Deploy Web UI (Optional) +# ----------------------------------------------------------------------------- +if [[ "${DEPLOY_UI:-true}" == "true" ]]; then + log_info "Deploying OSMO Web UI..." + + UI_HELM_ARGS=( + --namespace "${OSMO_NAMESPACE}" + --set services.ui.service.type=ClusterIP + --set services.ui.ingress.enabled=true + --set services.ui.ingress.ingressClass=nginx + --set "services.ui.ingress.sslEnabled=${TLS_ENABLED}" + --set services.ui.replicas=1 + --set "services.ui.apiHostname=osmo-service.${OSMO_NAMESPACE}.svc.cluster.local:80" + --set sidecars.logAgent.enabled=false + ) + [[ -n "$INGRESS_HOSTNAME" ]] && UI_HELM_ARGS+=(--set "services.ui.hostname=${INGRESS_HOSTNAME}" --set "global.domain=${INGRESS_HOSTNAME}") + + # Envoy sidecar config for Web UI + if [[ "$AUTH_ENABLED" == "true" ]]; then + log_info "Enabling Envoy sidecar on Web UI with Keycloak auth..." + UI_HELM_ARGS+=( + --set sidecars.envoy.enabled=true + --set sidecars.envoy.useKubernetesSecrets=true + --set "sidecars.envoy.service.hostname=${INGRESS_HOSTNAME}" + --set sidecars.envoy.service.address=127.0.0.1 + --set sidecars.envoy.service.port=8000 + # OAuth2 filter + --set sidecars.envoy.oauth2Filter.enabled=true + --set "sidecars.envoy.oauth2Filter.tokenEndpoint=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token" + --set "sidecars.envoy.oauth2Filter.authEndpoint=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth" + --set sidecars.envoy.oauth2Filter.redirectPath=getAToken + --set sidecars.envoy.oauth2Filter.clientId=osmo-browser-flow + --set "sidecars.envoy.oauth2Filter.authProvider=${AUTH_DOMAIN}" + --set sidecars.envoy.oauth2Filter.logoutPath=logout + --set sidecars.envoy.oauth2Filter.secretName=oidc-secrets + --set sidecars.envoy.oauth2Filter.clientSecretKey=client_secret + --set sidecars.envoy.oauth2Filter.hmacSecretKey=hmac_secret + # JWT filter + --set 
sidecars.envoy.jwt.user_header=x-osmo-user + # JWT Provider 1: Keycloak device flow (CLI) + --set "sidecars.envoy.jwt.providers[0].issuer=${KEYCLOAK_EXTERNAL_URL}/realms/osmo" + --set "sidecars.envoy.jwt.providers[0].audience=osmo-device" + --set "sidecars.envoy.jwt.providers[0].jwks_uri=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs" + --set "sidecars.envoy.jwt.providers[0].user_claim=preferred_username" + --set "sidecars.envoy.jwt.providers[0].cluster=oauth" + # JWT Provider 2: Keycloak browser flow (Web UI) + --set "sidecars.envoy.jwt.providers[1].issuer=${KEYCLOAK_EXTERNAL_URL}/realms/osmo" + --set "sidecars.envoy.jwt.providers[1].audience=osmo-browser-flow" + --set "sidecars.envoy.jwt.providers[1].jwks_uri=${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs" + --set "sidecars.envoy.jwt.providers[1].user_claim=preferred_username" + --set "sidecars.envoy.jwt.providers[1].cluster=oauth" + ) + else + UI_HELM_ARGS+=(--set sidecars.envoy.enabled=false) + fi + + # Proxy buffer annotations for Web UI ingress (required for OAuth2 -- JWT cookies make headers large) + if [[ "$AUTH_ENABLED" == "true" ]]; then + UI_HELM_ARGS+=( + --set-string "services.ui.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-buffer-size=16k" + --set-string "services.ui.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-buffers-number=4" + ) + fi + + # TLS settings for Web UI ingress + if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then + UI_HELM_ARGS+=( + --set-string "services.ui.ingress.annotations.nginx\.ingress\.kubernetes\.io/ssl-redirect=true" + --set "services.ui.ingress.tls[0].hosts[0]=${INGRESS_HOSTNAME}" + --set "services.ui.ingress.tls[0].secretName=${TLS_SECRET_NAME}" + ) + if [[ "$TLS_MODE" == "cert-manager" ]]; then + UI_HELM_ARGS+=( + --set-string "services.ui.ingress.annotations.cert-manager\.io/cluster-issuer=${CLUSTER_ISSUER_NAME:-letsencrypt-prod}" + ) + fi + fi + + helm upgrade --install osmo-ui osmo/web-ui \ + 
"${UI_HELM_ARGS[@]}" \ + --wait --timeout 5m || log_warning "UI deployment had issues" + + log_success "OSMO Web UI deployed" +fi + +# Cleanup temp files +rm -f /tmp/osmo_values.yaml + +# ----------------------------------------------------------------------------- +# Step 9: Patch Deployments to Add vault-secrets Volume +# ----------------------------------------------------------------------------- +# NOTE: The Helm chart's extraVolumes/extraVolumeMounts values don't work reliably. +# We must patch the deployments after Helm creates them to add the vault-secrets volume. +# This is a known workaround - the env vars work via extraEnv, but volumes don't. + +log_info "Patching OSMO deployments to add vault-secrets volume mount..." + +# Create the JSON patch file +cat > /tmp/vault-patch.json << 'PATCH_EOF' +[ + {"op": "add", "path": "/spec/template/spec/volumes/-", "value": {"name": "vault-secrets", "secret": {"secretName": "vault-secrets"}}}, + {"op": "add", "path": "/spec/template/spec/containers/0/volumeMounts/-", "value": {"name": "vault-secrets", "mountPath": "/home/osmo/vault-agent/secrets", "readOnly": true}} +] +PATCH_EOF + +# All OSMO deployments that need the vault-secrets volume for MEK +OSMO_DEPLOYMENTS="osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router" + +for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + # Check if vault-secrets volume already exists + EXISTING_VOL=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || true) + + if [[ -z "$EXISTING_VOL" ]]; then + log_info " Patching $deploy to add vault-secrets volume..." 
+ if kubectl patch deployment/$deploy -n "${OSMO_NAMESPACE}" --type=json --patch-file=/tmp/vault-patch.json; then + log_success " $deploy patched successfully" + else + log_warning " Failed to patch $deploy" + fi + else + log_info " $deploy already has vault-secrets volume, skipping" + fi + else + log_info " $deploy not found, skipping" + fi +done + +# Cleanup patch file +rm -f /tmp/vault-patch.json + +# Wait for rollouts to complete +log_info "Waiting for deployments to roll out with new configuration..." +for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + kubectl rollout status deployment/$deploy -n "${OSMO_NAMESPACE}" --timeout=180s || \ + log_warning " Timeout waiting for $deploy rollout" + fi +done + +log_success "All OSMO deployments patched with vault-secrets volume" + +# ----------------------------------------------------------------------------- +# Step 10: Patch Services for Direct Access (without Envoy) +# ----------------------------------------------------------------------------- +# When Envoy sidecar is disabled, services need to target port 8000 directly +# instead of the 'envoy-http' named port which doesn't exist. +# When Envoy IS enabled, the 'envoy-http' targetPort is correct -- skip patching. + +if [[ "$AUTH_ENABLED" == "true" ]]; then + log_info "Envoy sidecar is ENABLED -- skipping targetPort patches (envoy-http is correct)" +else + log_info "Verifying service ports (Envoy disabled)..." 
+ + OSMO_SERVICES="osmo-service osmo-router osmo-logger osmo-agent" + + for svc in $OSMO_SERVICES; do + if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then + CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") + + if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then + log_info " Patching $svc: targetPort envoy-http -> 8000" + kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ + -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ + log_warning " Failed to patch $svc" + else + log_info " $svc: targetPort = $CURRENT_TARGET (OK)" + fi + fi + done + + log_success "Service ports verified" +fi + +# ----------------------------------------------------------------------------- +# Step 11: Verify Deployment +# ----------------------------------------------------------------------------- +echo "" +log_info "Verifying deployment configuration..." 
+ +# Verify vault-secrets volumes are mounted +echo "" +echo "Volume configuration verification:" +for deploy in $OSMO_DEPLOYMENTS; do + if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then + VOL_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || echo "") + ENV_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.spec.template.spec.containers[0].env[*].name}' 2>/dev/null | grep -w "OSMO_POSTGRES_PASSWORD" || echo "") + + VOL_STATUS="✗" + ENV_STATUS="✗" + [[ -n "$VOL_CHECK" ]] && VOL_STATUS="✓" + [[ -n "$ENV_CHECK" ]] && ENV_STATUS="✓" + + echo " $deploy: vault-secrets=$VOL_STATUS, postgres_env=$ENV_STATUS" + fi +done + +echo "" +echo "Pods:" +kubectl get pods -n "${OSMO_NAMESPACE}" + +echo "" +echo "Services:" +kubectl get svc -n "${OSMO_NAMESPACE}" + +# ----------------------------------------------------------------------------- +# Step 12: Configure service_base_url (required for workflow execution) +# ----------------------------------------------------------------------------- +# The osmo-ctrl sidecar in every workflow pod needs service_base_url to +# stream logs, report task status, and refresh tokens. +# This is an application-level config that must be set via the OSMO API. + +echo "" +log_info "Configuring service_base_url for workflow execution..." + +# Detect target URL from Ingress +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + TARGET_SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${TARGET_SERVICE_URL}" +elif [[ -n "$INGRESS_URL" ]]; then + TARGET_SERVICE_URL="${INGRESS_URL}" + log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" +else + log_warning "Could not detect Ingress URL. Skipping service_base_url configuration." 
+ log_warning "Run ./07-configure-service-url.sh manually after verifying the Ingress." + TARGET_SERVICE_URL="" +fi + +if [[ -n "$TARGET_SERVICE_URL" ]]; then + # Start port-forward using the shared helper (auto-detects Envoy) + start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + _PF_PID=$PORT_FORWARD_PID + + _cleanup_pf() { + if [[ -n "${_PF_PID:-}" ]]; then + kill $_PF_PID 2>/dev/null || true + wait $_PF_PID 2>/dev/null || true + fi + } + + # Wait for port-forward to be ready + _pf_ready=false + for i in $(seq 1 30); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 + done + + if [[ "$_pf_ready" == "true" ]]; then + # Login (no-op when bypassing Envoy -- osmo_curl handles auth headers) + osmo_login 8080 || true + + # Check current value + CURRENT_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + + if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url already configured: ${CURRENT_SVC_URL}" + else + if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then + log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" + fi + + # Write config and use PATCH API + cat > /tmp/service_url_fix.json << SVCEOF +{ + "service_base_url": "${TARGET_SERVICE_URL}" +} +SVCEOF + if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then + # Verify + NEW_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then + log_success "service_base_url configured: ${NEW_SVC_URL}" + else + log_warning "service_base_url verification failed. Run ./07-configure-service-url.sh manually." + fi + else + log_warning "Failed to set service_base_url. 
Run ./07-configure-service-url.sh manually." + fi + rm -f /tmp/service_url_fix.json + fi + else + log_warning "Port-forward not ready. Run ./07-configure-service-url.sh manually." + fi + + _cleanup_pf +fi + +echo "" +echo "========================================" +log_success "OSMO Control Plane deployment complete!" +echo "========================================" +echo "" + +if [[ "$AUTH_ENABLED" == "true" ]]; then + # --- Auth-enabled output --- + echo "Authentication: ENABLED (Keycloak + Envoy sidecars)" + echo "" + echo "Keycloak Admin Console:" + echo " URL: https://${AUTH_DOMAIN}/admin" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo "" + echo "OSMO Access:" + if [[ -n "$INGRESS_URL" ]]; then + echo " OSMO API: ${INGRESS_URL}/api/version (unauthenticated -- skipAuthPath)" + echo " OSMO Web UI: ${INGRESS_URL} (redirects to Keycloak login)" + fi + echo "" + echo "Login methods:" + echo " Browser: Visit ${INGRESS_URL:-https://} -- you will be redirected to Keycloak" + echo " CLI: osmo login ${INGRESS_URL:-https://}" + echo " (Opens browser for device authorization flow)" + echo "" + echo "Test user: osmo-admin / osmo-admin" + echo "" + echo "Keycloak realm management (groups, roles, users):" + echo " https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/keycloak_setup.html" + echo "" +else + # --- No-auth output --- + if [[ -n "$INGRESS_URL" ]]; then + echo "OSMO Access (via NGINX Ingress LoadBalancer):" + echo " OSMO API: ${INGRESS_URL}/api/version" + echo " OSMO UI: ${INGRESS_URL}" + echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" + echo "" + else + log_warning "Could not detect Ingress LoadBalancer IP." + echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" + echo "" + echo " Fallback (port-forward):" + echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" + echo " URL: http://localhost:8080" + echo "" + fi + + echo "NOTE: OSMO API authentication is DISABLED." 
+ echo " The API is accessible without tokens." + echo " Set DEPLOY_KEYCLOAK=true with TLS to enable Keycloak + Envoy auth." + echo "" + echo "Test the API:" + if [[ -n "$INGRESS_URL" ]]; then + echo " curl ${INGRESS_URL}/api/version" + echo " curl ${INGRESS_URL}/api/workflow" + else + echo " curl http://localhost:8080/api/version" + echo " curl http://localhost:8080/api/workflow" + fi + echo "" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + echo "Keycloak Access (internal only, auth not enforced):" + echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" + echo " URL: http://localhost:8081" + echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" + echo " Test User: osmo-admin / osmo-admin" + echo "" + fi +fi + +echo "Ingress resources:" +kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true +echo "" +echo "Next step - Deploy Backend Operator:" +echo " ./05-deploy-osmo-backend.sh" +echo "" diff --git a/applications/osmo/deploy/002a-setup/04-enable-tls.sh b/applications/osmo/deploy/002a-setup/04-enable-tls.sh new file mode 100755 index 000000000..800b7ffb3 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/04-enable-tls.sh @@ -0,0 +1,439 @@ +#!/bin/bash +# +# Enable TLS/HTTPS using cert-manager + Let's Encrypt +# +# Can be run at two points in the deployment flow: +# +# A) Right after 03-deploy-nginx-ingress.sh (RECOMMENDED): +# Installs cert-manager, issues the TLS certificate early. +# When 05-deploy-osmo-control-plane.sh runs later, it auto-detects the +# certificate and creates TLS-enabled Ingress resources from the start. +# +# B) After 05-deploy-osmo-control-plane.sh (retrofit existing deployment): +# Does everything in (A) plus patches existing OSMO Ingress resources +# and updates service_base_url to HTTPS. +# +# Prerequisites: +# 1. NGINX Ingress Controller deployed (03-deploy-nginx-ingress.sh) +# 2. 
A DNS A record pointing your domain to the LoadBalancer IP +# +# Usage: +# ./04-enable-tls.sh +# +# Example: +# ./04-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud +# +# Optional environment variables: +# OSMO_TLS_EMAIL - Email for Let's Encrypt expiry notices (default: noreply@) +# OSMO_TLS_SECRET_NAME - K8s Secret name for certificate (default: osmo-tls) +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" +HOSTNAME="${HOSTNAME%.}" # Strip trailing dot (FQDN notation) +TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" + +echo "" +echo "========================================" +echo " Enable TLS/HTTPS" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Validate inputs +# ----------------------------------------------------------------------------- +if [[ -z "$HOSTNAME" ]]; then + log_error "Usage: $0 " + echo "" + echo "Example: $0 vl51.eu-north1.osmo.nebius.cloud" + echo "" + LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -n "$LB_IP" ]]; then + echo "Your LoadBalancer IP is: ${LB_IP}" + echo "Create a DNS A record pointing your domain to this IP, then re-run this script." 
+ fi + exit 1 +fi + +check_kubectl || exit 1 +check_helm || exit 1 + +log_info "Hostname: ${HOSTNAME}" +log_info "TLS secret: ${TLS_SECRET}" + +# Keycloak auth subdomain support +DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +AUTH_HOSTNAME="" +if [[ "$DEPLOY_KEYCLOAK" == "true" ]]; then + if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + AUTH_HOSTNAME="${KEYCLOAK_HOSTNAME}" + else + AUTH_HOSTNAME="auth.${HOSTNAME}" + fi + log_info "Keycloak auth hostname: ${AUTH_HOSTNAME}" + log_info "Keycloak TLS secret: ${KC_TLS_SECRET}" +fi + +# Get LoadBalancer IP +LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + +# Prompt user to set up DNS records before proceeding +echo "" +echo "========================================" +echo " DNS Record Setup Required" +echo "========================================" +echo "" +if [[ -n "$LB_IP" ]]; then + echo "Create the following DNS A record(s) pointing to your LoadBalancer IP:" + echo "" + echo " ${HOSTNAME} -> ${LB_IP}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME} -> ${LB_IP}" + fi +else + echo "LoadBalancer IP not yet assigned. Check with:" + echo " kubectl get svc -n ${INGRESS_NS} ingress-nginx-controller" + echo "" + echo "Once the IP is available, create DNS A record(s) for:" + echo " ${HOSTNAME}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " ${AUTH_HOSTNAME}" + fi +fi +echo "" +echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." 
+echo "" +read_prompt_var "Press Enter once DNS records are configured (or type 'skip' to skip DNS check)" DNS_CONFIRM "" + +# Verify DNS resolves to the LoadBalancer IP +if [[ "$DNS_CONFIRM" != "skip" ]]; then + DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) + + if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then + if [[ "$DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" + log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." + fi + elif [[ -z "$DNS_IP" ]]; then + log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." + fi + + if [[ -n "$AUTH_HOSTNAME" ]]; then + AUTH_DNS_IP=$(dig +short "$AUTH_HOSTNAME" 2>/dev/null | tail -1 || true) + if [[ -n "$LB_IP" && -n "$AUTH_DNS_IP" ]]; then + if [[ "$AUTH_DNS_IP" == "$LB_IP" ]]; then + log_success "DNS check: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP} (matches LoadBalancer)" + else + log_warning "DNS mismatch: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP}, but LoadBalancer IP is ${LB_IP}" + fi + elif [[ -z "$AUTH_DNS_IP" ]]; then + log_warning "Could not resolve ${AUTH_HOSTNAME}. Keycloak TLS cert may fail." 
+ fi + fi +fi + +# Check if OSMO is already deployed (determines whether to patch Ingress / update config) +INGRESS_COUNT=$(kubectl get ingress -n "${OSMO_NS}" --no-headers 2>/dev/null | wc -l | tr -d ' ') +if [[ "$INGRESS_COUNT" -gt 0 ]]; then + log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS} (will patch with TLS)" + OSMO_DEPLOYED="true" +else + log_info "No OSMO Ingress resources yet — preparing cert-manager and certificate" + log_info "Step 05 will auto-detect the TLS cert and create HTTPS Ingress" + OSMO_DEPLOYED="false" +fi + +# ----------------------------------------------------------------------------- +# Step 1: Install cert-manager +# ----------------------------------------------------------------------------- +log_info "Installing cert-manager..." +helm repo add jetstack https://charts.jetstack.io --force-update +helm repo update jetstack + +if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "cert-manager already installed" +else + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + --set crds.enabled=true \ + --wait --timeout 5m +fi +log_success "cert-manager ready" + +# ----------------------------------------------------------------------------- +# Step 2: Create Let's Encrypt ClusterIssuer +# ----------------------------------------------------------------------------- +TLS_EMAIL="${OSMO_TLS_EMAIL:-noreply@${HOSTNAME#*.}}" +log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..." + +kubectl apply -f - </dev/null); do + ing_name="${ing#*/}" + CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') + + kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || echo "") + if [[ "$CERT_READY" == "True" ]]; then + log_success "TLS certificate issued and ready" + break + fi + sleep 5 +done + +if [[ "$CERT_READY" != "True" ]]; then + log_warning "Certificate not ready yet. Checking status..." 
+ kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 + echo "" + log_info "It may take a few more minutes. Check with:" + echo " kubectl get certificate -n ${OSMO_NS}" + echo " kubectl describe challenge -n ${OSMO_NS}" +fi + +# ----------------------------------------------------------------------------- +# Step 4b: Issue TLS certificate for Keycloak auth subdomain (if DEPLOY_KEYCLOAK=true) +# ----------------------------------------------------------------------------- +if [[ -n "$AUTH_HOSTNAME" ]]; then + log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." + + # Create bootstrap Ingress for auth subdomain (to trigger HTTP-01 challenge) + kubectl apply -f - </dev/null || echo "") + if [[ "$AUTH_CERT_READY" == "True" ]]; then + log_success "Auth TLS certificate issued and ready" + break + fi + sleep 5 + done + + if [[ "$AUTH_CERT_READY" != "True" ]]; then + log_warning "Auth certificate not ready yet. It may take a few more minutes." + log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" + fi + + # Clean up the bootstrap Ingress if Keycloak will create its own + if [[ "$OSMO_DEPLOYED" == "true" ]]; then + kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null + fi +fi + +# ----------------------------------------------------------------------------- +# Step 5: Update OSMO service_base_url to HTTPS (only if OSMO is deployed) +# ----------------------------------------------------------------------------- +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + log_info "Updating OSMO service_base_url to https://${HOSTNAME}..." + + kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & + _PF_PID=$! 
+ trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT + + # Wait for port-forward + _pf_ready=false + for i in $(seq 1 15); do + if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then + _pf_ready=true + break + fi + sleep 1 + done + + if [[ "$_pf_ready" == "true" ]]; then + if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + cat > /tmp/service_url_tls.json </dev/null; then + NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + log_success "service_base_url updated to: ${NEW_URL}" + else + log_warning "Could not update service_base_url automatically." + log_info "Run: ./08-configure-service-url.sh https://${HOSTNAME}" + fi + rm -f /tmp/service_url_tls.json + else + log_warning "Could not login to OSMO API. Update service_base_url manually:" + log_info " ./08-configure-service-url.sh https://${HOSTNAME}" + fi + else + log_warning "Could not connect to OSMO API. 
Update service_base_url manually:" + log_info " ./08-configure-service-url.sh https://${HOSTNAME}" + fi +else + log_info "Skipping service_base_url update (OSMO not deployed yet)" + log_info "Step 05 will auto-detect TLS and use https:// for service_base_url" +fi + +# ----------------------------------------------------------------------------- +# Step 6: Clean up bootstrap Ingress (if OSMO was deployed after cert issued) +# ----------------------------------------------------------------------------- +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + # Remove the bootstrap ingress if it exists (from a previous Mode A run) + kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null +fi + +# ----------------------------------------------------------------------------- +# Done +# ----------------------------------------------------------------------------- +echo "" +echo "========================================" +log_success "TLS setup complete" +echo "========================================" +echo "" + +if [[ "$OSMO_DEPLOYED" == "true" ]]; then + echo "OSMO is now accessible at:" + echo " https://${HOSTNAME}" + echo " https://${HOSTNAME}/api/version" + echo "" + echo "CLI login:" + echo " osmo login https://${HOSTNAME} --method dev --username admin" +else + echo "TLS certificate prepared for: ${HOSTNAME}" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo "Auth TLS certificate prepared for: ${AUTH_HOSTNAME}" + fi + echo "" + echo "Next steps:" + echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}" + echo " 2. Deploy OSMO: ./05-deploy-osmo-control-plane.sh" + echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" + if [[ -n "$AUTH_HOSTNAME" ]]; then + echo " 3. 
Deploy with Keycloak: DEPLOY_KEYCLOAK=true ./05-deploy-osmo-control-plane.sh" + echo " (Keycloak will be exposed at https://${AUTH_HOSTNAME})" + fi +fi +echo "" diff --git a/applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh b/applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh new file mode 100755 index 000000000..40a3cff5a --- /dev/null +++ b/applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh @@ -0,0 +1,410 @@ +#!/bin/bash +# +# Deploy OSMO Backend Operator +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Backend Operator Deployment" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_OPERATOR_NAMESPACE="osmo-operator" +OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" +OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}" +OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" + +# Check for OSMO Service URL (in-cluster URL for the backend operator pods) +# IMPORTANT: Backend operators connect via WebSocket to osmo-agent, NOT osmo-service! +# The osmo-service handles REST API, osmo-agent handles WebSocket connections for backends +if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then + log_info "Auto-detecting in-cluster OSMO Agent URL..." 
+ + # Backend operators MUST connect to osmo-agent for WebSocket connections + # The osmo-service WebSocket routes only exist in dev mode + OSMO_AGENT=$(kubectl get svc -n osmo osmo-agent -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") + + if [[ -n "$OSMO_AGENT" ]]; then + OSMO_SERVICE_URL="http://osmo-agent.osmo.svc.cluster.local:80" + log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" + else + # Fallback: try to detect from any osmo-agent service + OSMO_AGENT=$(kubectl get svc -n osmo -l app.kubernetes.io/name=agent -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "$OSMO_AGENT" ]]; then + OSMO_SERVICE_URL="http://${OSMO_AGENT}.osmo.svc.cluster.local:80" + log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" + else + echo "" + log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./04-deploy-osmo-control-plane.sh" + log_error "Note: Backend operators require osmo-agent service for WebSocket connections" + exit 1 + fi + fi +fi + +# Check for OSMO Service Token +if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then + # First, ensure namespace exists so we can check for existing secret + kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true + + # Check if token secret already exists in cluster + EXISTING_TOKEN=$(kubectl get secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" -o jsonpath='{.data.token}' 2>/dev/null | base64 -d || echo "") + + if [[ -n "$EXISTING_TOKEN" ]]; then + log_info "Using existing token from secret osmo-operator-token" + OSMO_SERVICE_TOKEN="$EXISTING_TOKEN" + elif command -v osmo &>/dev/null; then + # Check if osmo CLI is already logged in (don't try to login with in-cluster URL) + log_info "Checking if OSMO CLI is already logged in..." 
+ + # Try to generate token - this only works if CLI is already logged in + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + TOKEN_JSON=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token" \ + --service --roles osmo-backend -t json 2>/dev/null || echo "") + + if [[ -n "$TOKEN_JSON" ]]; then + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_JSON" | jq -r '.token // empty' 2>/dev/null || echo "") + fi + + if [[ -n "$OSMO_SERVICE_TOKEN" ]]; then + log_success "Service token generated: $TOKEN_NAME (expires: $EXPIRY_DATE)" + fi + fi + + # If still no token, automatically create one using port-forward + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_info "No token found - automatically creating service token..." + + TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" + EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") + + # Cleanup function to kill port-forwards on exit + PF_PIDS=() + cleanup_port_forwards() { + for pid in "${PF_PIDS[@]}"; do + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + done + } + trap cleanup_port_forwards EXIT + + # Detect if Keycloak auth is enabled + KEYCLOAK_ENABLED="false" + if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then + KEYCLOAK_ENABLED="true" + elif kubectl get svc -n "${OSMO_NAMESPACE:-osmo}" keycloak &>/dev/null; then + KEYCLOAK_ENABLED="true" + fi + + if [[ "$KEYCLOAK_ENABLED" == "true" ]]; then + # --------------------------------------------------------------- + # Keycloak-enabled: use Resource Owner Password Grant to get JWT, + # then call OSMO REST API with Bearer token + # --------------------------------------------------------------- + log_info "Keycloak detected - using password grant for token creation..." 
+ + # Derive Keycloak external URL from the ingress (ensures JWT issuer matches + # what Envoy expects -- using port-forward would produce a wrong issuer) + KC_INGRESS_HOST=$(kubectl get ingress -n "${OSMO_NAMESPACE:-osmo}" keycloak -o jsonpath='{.spec.rules[0].host}' 2>/dev/null || echo "") + if [[ -z "$KC_INGRESS_HOST" ]]; then + log_error "Could not detect Keycloak ingress hostname" + exit 1 + fi + KEYCLOAK_TOKEN_URL="https://${KC_INGRESS_HOST}/realms/osmo/protocol/openid-connect/token" + log_info "Keycloak token endpoint: ${KEYCLOAK_TOKEN_URL}" + + # Port-forward to OSMO service (for the token creation API) + log_info "Starting port-forward to OSMO service..." + kubectl port-forward -n "${OSMO_NAMESPACE:-osmo}" svc/osmo-service 8080:80 &>/dev/null & + PF_PIDS+=($!) + + # Wait for port-forward to be ready + log_info "Waiting for port-forward to be ready..." + max_wait=30 + elapsed=0 + while true; do + SVC_READY=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") + if [[ "$SVC_READY" =~ ^(200|401|403)$ ]]; then + break + fi + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s (service=$SVC_READY)" + exit 1 + fi + done + log_success "Port-forward ready" + + # Get Keycloak JWT via Resource Owner Password Grant + # Uses osmo-device client (public, directAccessGrantsEnabled=true) + # MUST use external Keycloak URL so the JWT issuer matches what Envoy expects + KC_ADMIN_USER="${OSMO_KC_ADMIN_USER:-osmo-admin}" + KC_ADMIN_PASS="${OSMO_KC_ADMIN_PASS:-osmo-admin}" + + log_info "Authenticating with Keycloak as '${KC_ADMIN_USER}'..." 
+ KC_RESPONSE=$(curl -s -X POST "${KEYCLOAK_TOKEN_URL}" \ + -d "grant_type=password" \ + -d "client_id=osmo-device" \ + -d "username=${KC_ADMIN_USER}" \ + -d "password=${KC_ADMIN_PASS}") + + KC_JWT=$(echo "$KC_RESPONSE" | jq -r '.access_token // empty' 2>/dev/null || echo "") + if [[ -z "$KC_JWT" ]]; then + KC_ERROR=$(echo "$KC_RESPONSE" | jq -r '.error_description // .error // empty' 2>/dev/null || echo "unknown error") + log_error "Keycloak authentication failed: $KC_ERROR" + log_error "Ensure OSMO_KC_ADMIN_USER and OSMO_KC_ADMIN_PASS are set, or that osmo-admin/osmo-admin is valid" + exit 1 + fi + log_success "Keycloak authentication successful" + + # Create service token via OSMO REST API + # NOTE: Must use "x-osmo-auth" header (not Authorization), because: + # 1. Envoy's OAuth2 filter runs first and would redirect to Keycloak + # if it doesn't see OAuth cookies. The "x-osmo-auth" header triggers + # the pass_through_matcher, bypassing the OAuth2 redirect. + # 2. Envoy's JWT filter reads from "x-osmo-auth" (not Authorization). + # 3. No "Bearer " prefix -- the JWT filter has no value_prefix configured, + # so it expects the raw JWT directly. + log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." + TOKEN_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \ + "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}?expires_at=${EXPIRY_DATE}&roles=osmo-backend" \ + -H "x-osmo-auth: ${KC_JWT}" \ + -H "Content-Type: application/json") + + # Separate response body from HTTP status code + HTTP_CODE=$(echo "$TOKEN_RESPONSE" | tail -1) + TOKEN_BODY=$(echo "$TOKEN_RESPONSE" | sed '$d') + + if [[ "$HTTP_CODE" != "200" && "$HTTP_CODE" != "201" ]]; then + log_error "Token creation API returned HTTP $HTTP_CODE" + log_error "Response: $TOKEN_BODY" + exit 1 + fi + + # Response is the raw token string (quoted JSON string) + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_BODY" | jq -r '. 
// empty' 2>/dev/null || echo "") + # If jq fails (response might be a plain string, not JSON), use raw + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_BODY" | tr -d '"' | tr -d '\r' | xargs) + fi + + else + # --------------------------------------------------------------- + # No Keycloak: use dev auth method (original approach) + # --------------------------------------------------------------- + # Check if osmo CLI is available + if ! command -v osmo &>/dev/null; then + log_error "osmo CLI not found. Please install it first." + exit 1 + fi + + # Start port-forward in background + log_info "Starting port-forward to OSMO service..." + kubectl port-forward -n "${OSMO_NAMESPACE:-osmo}" svc/osmo-service 8080:80 &>/dev/null & + PF_PIDS+=($!) + + # Wait for port-forward to be ready + log_info "Waiting for port-forward to be ready..." + max_wait=30 + elapsed=0 + while ! curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi + done + log_success "Port-forward ready" + + # Login with dev method (auth is disabled) + log_info "Logging in to OSMO (dev method)..." + if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO. If Keycloak is enabled, set DEPLOY_KEYCLOAK=true" + exit 1 + fi + log_success "Logged in successfully" + + # Create service token + log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." 
+ TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ + --expires-at "$EXPIRY_DATE" \ + --description "Backend Operator Token (auto-generated)" \ + --service --roles osmo-backend 2>&1) + + # Extract token from output (format: "Access token: ") + OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) + fi + + if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then + log_error "Failed to create service token" + echo "Response: ${TOKEN_RESPONSE:-$TOKEN_OUTPUT}" + exit 1 + fi + + log_success "Service token created: $TOKEN_NAME (expires: $EXPIRY_DATE)" + + # Stop port-forwards + cleanup_port_forwards + trap - EXIT + fi +fi + +# ----------------------------------------------------------------------------- +# Add OSMO Helm Repository +# ----------------------------------------------------------------------------- +log_info "Adding OSMO Helm repository..." +helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update +helm repo update + +# ----------------------------------------------------------------------------- +# Create Namespaces +# ----------------------------------------------------------------------------- +log_info "Creating namespaces..." +kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - +kubectl create namespace "${OSMO_WORKFLOWS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Secrets +# ----------------------------------------------------------------------------- +log_info "Creating operator token secret..." 
+kubectl create secret generic osmo-operator-token \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --from-literal=token="${OSMO_SERVICE_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + +# ----------------------------------------------------------------------------- +# Create Values File +# ----------------------------------------------------------------------------- +log_info "Creating Helm values file..." + +# Note: services.backendListener/Worker are at root level, not under global +# See: osmo-helm-charts/backend-operator/values.yaml +cat > /tmp/backend_operator_values.yaml </dev/null || true + +echo "" +echo "========================================" +log_success "OSMO Backend Operator deployment complete!" +echo "========================================" +echo "" +echo "Backend Name: ${BACKEND_NAME}" +echo "Agent URL (WebSocket): ${OSMO_SERVICE_URL}" +echo "" +# Detect Ingress URL for verification instructions +INGRESS_URL=$(detect_service_url 2>/dev/null || true) + +echo "To verify the backend registration:" +echo "" +if [[ -n "$INGRESS_URL" ]]; then + echo " Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl (using NGINX Ingress LoadBalancer):" + echo " curl ${INGRESS_URL}/api/configs/backend" +else + echo " Terminal 1 - Start port-forward (keep running):" + echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" + echo "" + echo " Terminal 2 - Check backend status:" + echo " osmo config show BACKEND ${BACKEND_NAME}" + echo "" + echo " Or via curl:" + echo " curl http://localhost:8080/api/configs/backend" +fi +echo "" +echo "Next step - Configure Storage:" +echo " ./06-configure-storage.sh" +echo "" diff --git a/applications/osmo/deploy/002a-setup/06-configure-storage.sh b/applications/osmo/deploy/002a-setup/06-configure-storage.sh new file mode 100755 index 000000000..754540fc1 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/06-configure-storage.sh @@ -0,0 +1,244 @@ +#!/bin/bash 
+# +# Configure OSMO Storage +# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/configure_data_storage.html +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Storage Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Get Storage Configuration from Terraform +# ----------------------------------------------------------------------------- +log_info "Retrieving storage configuration from Terraform..." + +S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "") + +# Default endpoint if not set +if [[ -z "$S3_ENDPOINT" ]]; then + S3_ENDPOINT="https://storage.eu-north1.nebius.cloud" +fi + +if [[ -z "$S3_BUCKET" ]]; then + log_error "Could not retrieve storage bucket name from Terraform" + echo "" + echo "Make sure you have run 'terraform apply' in deploy/001-iac" + echo "and that storage is enabled in your terraform.tfvars" + exit 1 +fi + +log_success "Storage bucket: ${S3_BUCKET}" +log_success "Storage endpoint: ${S3_ENDPOINT}" + +# ----------------------------------------------------------------------------- +# Check/Create osmo-storage secret +# ----------------------------------------------------------------------------- +log_info "Checking for osmo-storage secret..." + +if ! kubectl get secret osmo-storage -n osmo &>/dev/null; then + log_warning "osmo-storage secret not found - attempting to create from MysteryBox..." 
+ + # Get credentials from Terraform/MysteryBox + S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") + S3_SECRET_KEY="" + + if [[ -n "$S3_SECRET_REF_ID" ]]; then + log_info "Retrieving storage secret from MysteryBox..." + # IAM access key secrets are stored with key "secret" in MysteryBox + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") + fi + + if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + echo "" + echo "Either re-run 04-deploy-osmo-control-plane.sh or create the secret manually:" + echo "" + echo " kubectl create secret generic osmo-storage \\" + echo " --namespace osmo \\" + echo " --from-literal=access-key-id= \\" + echo " --from-literal=secret-access-key=" + exit 1 + fi + + # Create the secret + kubectl create secret generic osmo-storage \ + --namespace osmo \ + --from-literal=access-key-id="${S3_ACCESS_KEY}" \ + --from-literal=secret-access-key="${S3_SECRET_KEY}" \ + --dry-run=client -o yaml | kubectl apply -f - + + log_success "osmo-storage secret created" +else + log_success "osmo-storage secret exists" +fi + +# ----------------------------------------------------------------------------- +# Start port-forward and configure storage +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" + +start_osmo_port_forward "${OSMO_NS}" 8080 + +# Cleanup function +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login (no-op when bypassing Envoy -- curl headers handle auth) +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Get Storage Credentials +# ----------------------------------------------------------------------------- +log_info "Retrieving storage credentials..." + +# Get access key from Terraform +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") + +# Get secret key from osmo-storage secret (already created) +S3_SECRET_KEY=$(kubectl get secret osmo-storage -n osmo -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then + log_error "Could not retrieve storage credentials" + exit 1 +fi + +# Nebius Object Storage uses S3-compatible API +# OSMO uses TOS (Torch Object Storage) scheme for S3-compatible storage with custom endpoints +# Format: tos:/// +S3_HOST=$(echo "$S3_ENDPOINT" | sed 's|https://||') +BACKEND_URI="tos://${S3_HOST}/${S3_BUCKET}" +REGION="eu-north1" + +log_success "Storage credentials retrieved" + +# ----------------------------------------------------------------------------- +# Configure Workflow Log Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow log storage..." 
+ +# Create workflow log config JSON +WORKFLOW_LOG_CONFIG=$(cat < /tmp/workflow_log_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_log_config.json "Configure workflow log storage"; then + log_success "Workflow log storage configured" +else + log_error "Failed to configure workflow log storage" + rm -f /tmp/workflow_log_config.json + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Configure Workflow Data Storage in OSMO +# ----------------------------------------------------------------------------- +log_info "Configuring workflow data storage..." + +# Create workflow data config JSON +WORKFLOW_DATA_CONFIG=$(cat < /tmp/workflow_data_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_data_config.json "Configure workflow data storage"; then + log_success "Workflow data storage configured" +else + log_error "Failed to configure workflow data storage" + rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + exit 1 +fi + +# Cleanup temp files +rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying storage configuration..." + +echo "" +echo "Workflow configuration:" +osmo_curl GET "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' || \ + log_warning "Could not retrieve workflow config for verification" + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Storage configuration complete!" 
+echo "========================================" +echo "" +echo "Storage Details:" +echo " Bucket: ${S3_BUCKET}" +echo " Endpoint: ${S3_ENDPOINT}" +echo " Backend URI: ${BACKEND_URI}" +echo " Region: ${REGION}" +echo "" +echo "Configured:" +echo " - workflow_log: For storing workflow logs" +echo " - workflow_data: For storing intermediate task data" +echo "" +echo "OSMO can now store workflow artifacts in Nebius Object Storage." +echo "" diff --git a/applications/osmo/deploy/002a-setup/07-configure-service-url.sh b/applications/osmo/deploy/002a-setup/07-configure-service-url.sh new file mode 100755 index 000000000..f4781e718 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/07-configure-service-url.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Configure OSMO Service URL +# Required for osmo-ctrl sidecar to communicate with OSMO service +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " OSMO Service URL Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" + +start_osmo_port_forward "${OSMO_NS}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# Login (no-op when bypassing Envoy -- curl headers handle auth) +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Determine the target service URL +# ----------------------------------------------------------------------------- +log_info "Determining target service URL..." + +# Priority: +# 1. Explicit OSMO_INGRESS_BASE_URL (user override) +# 2. Auto-detect from NGINX Ingress Controller LoadBalancer +if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then + SERVICE_URL="${OSMO_INGRESS_BASE_URL}" + log_info "Using explicit Ingress base URL: ${SERVICE_URL}" +elif DETECTED_URL=$(detect_service_url 2>/dev/null) && [[ -n "$DETECTED_URL" ]]; then + SERVICE_URL="${DETECTED_URL}" + log_info "Auto-detected service URL: ${SERVICE_URL}" +else + log_error "Could not detect NGINX Ingress Controller URL." + log_error "Ensure 03-deploy-nginx-ingress.sh was run and the LoadBalancer has an IP." + if [[ "${OSMO_TLS_ENABLED:-false}" == "true" ]]; then + log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=https://" + else + log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=http://" + fi + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Check current service_base_url +# ----------------------------------------------------------------------------- +log_info "Checking current service_base_url..." 
+ +CURRENT_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') +echo "Current service_base_url: '${CURRENT_URL}'" + +if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" && "$CURRENT_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url is already correctly configured: ${CURRENT_URL}" + cleanup_port_forward + trap - EXIT + exit 0 +elif [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then + log_warning "service_base_url is set to '${CURRENT_URL}' but should be '${SERVICE_URL}'" + log_info "Updating service_base_url..." +fi + +# ----------------------------------------------------------------------------- +# Configure service_base_url +# ----------------------------------------------------------------------------- +log_info "Configuring service_base_url to: ${SERVICE_URL}" + +cat > /tmp/service_url_fix.json << EOF +{ + "service_base_url": "${SERVICE_URL}" +} +EOF + +if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then + log_success "service_base_url configured" +else + log_error "Failed to configure service_base_url" + rm -f /tmp/service_url_fix.json + exit 1 +fi + +rm -f /tmp/service_url_fix.json + +# ----------------------------------------------------------------------------- +# Verify Configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." + +NEW_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') + +if [[ "$NEW_URL" == "$SERVICE_URL" ]]; then + log_success "service_base_url verified: ${NEW_URL}" +else + log_error "Verification failed. Expected: ${SERVICE_URL}, Got: ${NEW_URL}" + exit 1 +fi + +# Cleanup +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO Service URL configuration complete!" 
+echo "========================================" +echo "" +echo "Service URL: ${SERVICE_URL}" +echo "" +echo "This URL is used by the osmo-ctrl sidecar container to:" +echo " - Stream workflow logs to the OSMO service" +echo " - Report task status and completion" +echo " - Fetch authentication tokens" +echo "" diff --git a/applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh new file mode 100755 index 000000000..c0f6775ab --- /dev/null +++ b/applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Configure OSMO GPU platform with tolerations via pod templates +# Based on OSMO documentation: https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/resource_pools.html + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" + +echo "" +echo "========================================" +echo " OSMO GPU Platform Configuration" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# ----------------------------------------------------------------------------- +# Start port-forward +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +# Wait for port-forward to be ready +log_info "Waiting for port-forward to be ready..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +# ----------------------------------------------------------------------------- +# Step 1: Create GPU pod template +# ----------------------------------------------------------------------------- +log_info "Creating gpu_tolerations pod template..." + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/gpu_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Pod template created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 2: Create GPU platform +# ----------------------------------------------------------------------------- +log_info "Creating gpu platform in default pool..." + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/gpu_platform_update.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "GPU platform created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create GPU platform (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 3: Verify configuration +# ----------------------------------------------------------------------------- +log_info "Verifying configuration..." 
+ +echo "" +echo "Pod templates:" +osmo_curl GET "${OSMO_URL}/api/configs/pod_template" | jq 'keys' + +echo "" +echo "GPU platform config:" +osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' + +# ----------------------------------------------------------------------------- +# Step 4: Check GPU resources +# ----------------------------------------------------------------------------- +log_info "Checking GPU resources..." +sleep 3 # Wait for backend to pick up changes + +RESOURCE_JSON=$(osmo_curl GET "${OSMO_URL}/api/resources" 2>/dev/null || echo '{}') +RESOURCE_COUNT=$(echo "$RESOURCE_JSON" | jq '[(.resources // [])[] | select(.allocatable_fields.gpu != null)] | length' 2>/dev/null || echo "0") +echo "GPU nodes visible to OSMO: ${RESOURCE_COUNT}" + +if [[ "$RESOURCE_COUNT" -gt 0 ]]; then + echo "" + echo "GPU resources:" + echo "$RESOURCE_JSON" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' +fi + +# ----------------------------------------------------------------------------- +# Done +# ----------------------------------------------------------------------------- +log_success "GPU platform configuration complete" +echo "" +echo "To submit a GPU workflow:" +echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default" +echo "" +echo "Or test via curl:" +echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" diff --git a/applications/osmo/deploy/002a-setup/README.md b/applications/osmo/deploy/002a-setup/README.md new file mode 100755 index 000000000..05ec8b55c --- /dev/null +++ b/applications/osmo/deploy/002a-setup/README.md @@ -0,0 +1,363 @@ +# Kubernetes Setup Scripts + +This directory contains scripts for configuring the Kubernetes cluster with GPU infrastructure and OSMO components. + +## Prerequisites + +1. 
Complete infrastructure deployment (001-iac) +2. kubectl configured with cluster access: + ```bash + nebius mk8s cluster get-credentials --id --external + ``` + +## Deployment Order + +Run scripts in order: + +```bash +# 1. GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) +./01-deploy-gpu-infrastructure.sh + +# 2. Observability (Prometheus, Grafana, Loki) +./02-deploy-observability.sh + +# 3. NGINX Ingress Controller (required – provides routing for OSMO services) +./03-deploy-nginx-ingress.sh + +# 4. OSMO Control Plane +./04-deploy-osmo-control-plane.sh + +# 5. OSMO Backend +./05-deploy-osmo-backend.sh + +# 6. Configure Storage (requires port-forward, see main README) +./06-configure-storage.sh + +# 7. Configure GPU Platform (required for GPU workflows) +./08-configure-gpu-platform.sh +``` + +## Scripts + +| Script | Purpose | Duration | +|--------|---------|----------| +| `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | +| `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | +| `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | +| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | +| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | +| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | +| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | + +## Configuration + +### Helm Values + +Customize deployments by editing files in `values/`: + +| File | Component | +|------|-----------| +| `gpu-operator.yaml` | NVIDIA GPU Operator | +| `network-operator.yaml` | NVIDIA Network Operator | +| `kai-scheduler.yaml` | KAI GPU Scheduler | +| `prometheus.yaml` | Prometheus + Grafana | +| 
`loki.yaml` | Loki Log Aggregation | +| `promtail.yaml` | Log Collection | + +### Environment Variables + +Configure via `defaults.sh` or export before running: + +```bash +# Namespaces +GPU_OPERATOR_NAMESPACE="gpu-operator" +NETWORK_OPERATOR_NAMESPACE="network-operator" +MONITORING_NAMESPACE="monitoring" +OSMO_NAMESPACE="osmo" + +# Grafana password (auto-generated if empty) +GRAFANA_ADMIN_PASSWORD="" + +# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) +OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. osmo.example.com); leave empty for IP-based access +OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty +``` + +### Secrets from MysteryBox + +If you ran `secrets-init.sh` in the prerequisites step, the following environment variables are set: + +| Variable | Description | +|----------|-------------| +| `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | +| `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | + +The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. + +**Secret retrieval order:** +1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) +2. **Terraform outputs** (fallback) +3. **Environment variables** (fallback) +4. 
**Interactive prompt** (last resort) + +To manually retrieve secrets from MysteryBox: +```bash +# PostgreSQL password +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_postgresql_mysterybox_secret_id \ + --key password --format json | jq -r '.data.string_value' + +# MEK (Master Encryption Key) +nebius mysterybox v1 payload get-by-key \ + --secret-id $TF_VAR_mek_mysterybox_secret_id \ + --key mek --format json | jq -r '.data.string_value' +``` + +## Accessing Services + +### Grafana Dashboard + +```bash +kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 +# Open http://localhost:3000 +# User: admin +# Password: (shown during deployment or in defaults.sh) +``` + +### Prometheus + +```bash +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 +# Open http://localhost:9090 +``` + +### OSMO API + +```bash +kubectl port-forward -n osmo svc/osmo-service 8080:80 +# Open http://localhost:8080 +``` + +### OSMO Web UI + +```bash +kubectl port-forward -n osmo svc/osmo-ui 8081:80 +# Open http://localhost:8081 +``` + +## Cleanup + +Run cleanup scripts in reverse order: + +```bash +cd cleanup + +# Remove OSMO +./uninstall-osmo-backend.sh +./uninstall-osmo-control-plane.sh + +# Remove observability +./uninstall-observability.sh + +# Remove GPU infrastructure +./uninstall-gpu-infrastructure.sh +``` + +## Configure OSMO GPU Platform + +After deploying OSMO backend, configure the GPU platform so OSMO can schedule workloads on GPU nodes. + +### Why is this needed? + +Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pods from being scheduled unless they have matching tolerations. OSMO needs to be configured with: + +1. A **pod template** with GPU tolerations and node selector +2. 
A **GPU platform** that references this pod template + +### Option 1: Run the Configuration Script (Recommended) + +```bash +./08-configure-gpu-platform.sh +``` + +### Option 2: Manual Configuration via API + +With port-forward running (`kubectl port-forward -n osmo svc/osmo-service 8080:80`): + +**Step 1: Create GPU Pod Template** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pod_template/gpu_tolerations' \ + -H 'Content-Type: application/json' \ + -d @gpu_pod_template.json +``` + +Where `gpu_pod_template.json` contains: + +```json +{ + "configs": { + "spec": { + "tolerations": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ], + "nodeSelector": { + "nvidia.com/gpu.present": "true" + } + } + } +} +``` + +**Step 2: Create GPU Platform** + +```bash +curl -X PUT 'http://localhost:8080/api/configs/pool/default/platform/gpu' \ + -H 'Content-Type: application/json' \ + -d @gpu_platform_update.json +``` + +Where `gpu_platform_update.json` contains: + +```json +{ + "configs": { + "description": "GPU platform for L40S nodes", + "host_network_allowed": false, + "privileged_allowed": false, + "allowed_mounts": [], + "default_mounts": [], + "default_variables": { + "USER_GPU": 1 + }, + "resource_validations": [], + "override_pod_template": ["gpu_tolerations"] + } +} +``` + +### Verify Configuration + +```bash +# Check pod templates +curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' +# Should include: "gpu_tolerations" + +# Check GPU platform +curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms.gpu' + +# Check resources (GPU nodes should now be visible) +curl -s http://localhost:8080/api/resources | jq '.resources[] | {name: .name, gpu: .allocatable_fields.gpu}' +``` + +### Using GPU in Workflows + +Specify `platform: gpu` in your OSMO workflow: + +```yaml +workflow: + name: my-gpu-job + resources: + gpu-resource: + platform: gpu # <-- Selects GPU platform with tolerations + gpu: 1 + 
memory: 4Gi + tasks: + - name: train + image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + command: ["nvidia-smi"] + resource: gpu-resource +``` + +## Troubleshooting + +### GPU Nodes Not Ready + +1. Check GPU operator pods: + ```bash + kubectl get pods -n gpu-operator + ``` + +2. Check node labels: + ```bash + kubectl get nodes -l node-type=gpu --show-labels + ``` + +3. Check DCGM exporter: + ```bash + kubectl logs -n gpu-operator -l app=nvidia-dcgm-exporter + ``` + +### Pods Pending on GPU Nodes + +1. Verify tolerations: + ```bash + kubectl describe pod | grep -A5 Tolerations + ``` + +2. Check node taints: + ```bash + kubectl describe node | grep Taints + ``` + +### InfiniBand Issues + +1. Check Network Operator: + ```bash + kubectl get pods -n network-operator + ``` + +2. Verify RDMA devices: + ```bash + kubectl exec -n gpu-operator -- ibstat + ``` + +### Database Connection Failed + +1. Verify PostgreSQL is accessible: + ```bash + kubectl get secret osmo-database -n osmo -o yaml + ``` + +2. Test connection from a pod: + ```bash + kubectl run pg-test --rm -it --image=postgres:16 -- psql -h -U -d + ``` + +### OSMO Not Seeing GPU Resources + +If OSMO shows 0 GPUs or GPU workflows fail to schedule: + +1. Check if GPU platform is configured: + ```bash + curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms | keys' + # Should include "gpu" + ``` + +2. Check if GPU pod template exists: + ```bash + curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' + # Should include "gpu_tolerations" + ``` + +3. Check GPU node labels and taints: + ```bash + kubectl describe node | grep -E 'Taints:|nvidia.com/gpu' + # Should show taint: nvidia.com/gpu=true:NoSchedule + # Should show label: nvidia.com/gpu.present=true + ``` + +4. If missing, run the GPU configuration: + ```bash + ./08-configure-gpu-platform.sh + ``` + +5. 
Verify OSMO sees GPU resources: + ```bash + curl -s http://localhost:8080/api/resources | jq '.resources[] | select(.allocatable_fields.gpu != null)' + ``` diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh new file mode 100755 index 000000000..de869a0cf --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Uninstall GPU Infrastructure +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling GPU Infrastructure" +echo "========================================" +echo "" + +log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing KAI Scheduler..." +helm uninstall kai-scheduler -n "${KAI_SCHEDULER_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${KAI_SCHEDULER_NAMESPACE}" --ignore-not-found + +log_info "Removing Network Operator..." +helm uninstall network-operator -n "${NETWORK_OPERATOR_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${NETWORK_OPERATOR_NAMESPACE}" --ignore-not-found + +log_info "Removing GPU Operator..." +helm uninstall gpu-operator -n "${GPU_OPERATOR_NAMESPACE}" 2>/dev/null || true + +# Remove GPU Operator CRDs +log_info "Removing GPU Operator CRDs..." 
+kubectl delete crd clusterpolicies.nvidia.com --ignore-not-found +kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found + +kubectl delete namespace "${GPU_OPERATOR_NAMESPACE}" --ignore-not-found + +log_success "GPU infrastructure uninstalled" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh new file mode 100755 index 000000000..caeaa8b74 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Uninstall Keycloak and disable OSMO authentication +# This removes Keycloak and related secrets. After running this, re-deploy +# OSMO control plane without DEPLOY_KEYCLOAK to switch back to open API mode. +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" +INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" + +echo "" +echo "========================================" +echo " Uninstall Keycloak" +echo "========================================" +echo "" + +check_kubectl || exit 1 +check_helm || exit 1 + +# Step 1: Uninstall Keycloak Helm release +log_info "Uninstalling Keycloak Helm release..." +helm uninstall keycloak --namespace "${OSMO_NS}" 2>/dev/null || log_info "Keycloak Helm release not found (already removed)" + +# Step 2: Delete Keycloak config job and realm ConfigMap +log_info "Cleaning up Keycloak configuration job and ConfigMap..." +kubectl delete job keycloak-osmo-setup -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete configmap keycloak-realm-json -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + +# Step 3: Delete Keycloak-related secrets +log_info "Deleting Keycloak secrets..." 
+kubectl delete secret keycloak-admin-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret keycloak-db-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret oidc-secrets -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +log_success "Keycloak secrets deleted" + +# Step 4: Delete Keycloak TLS secret +log_info "Deleting Keycloak TLS secret (${KC_TLS_SECRET})..." +kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true +kubectl delete secret "${KC_TLS_SECRET}" -n "${INGRESS_NS}" --ignore-not-found 2>/dev/null || true +log_success "Keycloak TLS secrets deleted" + +# Step 5: Delete Keycloak PVCs (if any) +log_info "Cleaning up Keycloak PVCs..." +kubectl delete pvc -l app.kubernetes.io/name=keycloak -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + +echo "" +log_success "Keycloak uninstalled" +echo "" +echo "Next steps:" +echo " 1. Re-deploy OSMO control plane without authentication:" +echo " unset DEPLOY_KEYCLOAK" +echo " ./05-deploy-osmo-control-plane.sh" +echo "" +echo " 2. (Optional) Drop the Keycloak database from PostgreSQL:" +echo " Connect to your Managed PostgreSQL and run:" +echo " DROP DATABASE IF EXISTS keycloak;" +echo "" +echo " 3. (Optional) Remove the DNS A record for the auth subdomain" +echo "" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh new file mode 100755 index 000000000..471029d5c --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Uninstall NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" +INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" +log_info "Uninstalling NGINX Ingress Controller..." +helm uninstall "${INGRESS_RELEASE_NAME}" -n "${INGRESS_NAMESPACE}" 2>/dev/null || true +kubectl delete namespace "${INGRESS_NAMESPACE}" --ignore-not-found --timeout=60s 2>/dev/null || true +log_success "NGINX Ingress Controller uninstalled" + +# Uninstall cert-manager (if installed) +if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "Uninstalling cert-manager..." + kubectl delete clusterissuer letsencrypt --ignore-not-found 2>/dev/null || true + helm uninstall cert-manager -n cert-manager 2>/dev/null || true + kubectl delete namespace cert-manager --ignore-not-found --timeout=60s 2>/dev/null || true + log_success "cert-manager uninstalled" +fi diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh new file mode 100755 index 000000000..e847de5a6 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Uninstall Observability Stack +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling Observability Stack" +echo "========================================" +echo "" + +log_warning "This will remove Prometheus, Grafana, and Loki" +# Read input with a prompt into a variable (bash/zsh compatible). +read_prompt_var() { + local prompt=$1 + local var_name=$2 + local default=$3 + local value="" + local read_from="/dev/tty" + local write_to="/dev/tty" + + if [[ ! -r "/dev/tty" || ! 
-w "/dev/tty" ]]; then + read_from="/dev/stdin" + write_to="/dev/stdout" + fi + + if [[ -n "$default" ]]; then + printf "%s [%s]: " "$prompt" "$default" >"$write_to" + else + printf "%s: " "$prompt" >"$write_to" + fi + + IFS= read -r value <"$read_from" + if [[ -z "$value" && -n "$default" ]]; then + value="$default" + fi + + eval "$var_name='$value'" +} + +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing Promtail..." +helm uninstall promtail -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +log_info "Removing Loki..." +helm uninstall loki -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +log_info "Removing Prometheus stack..." +helm uninstall prometheus -n "${MONITORING_NAMESPACE}" 2>/dev/null || true + +# Remove CRDs +log_info "Removing Prometheus CRDs..." +kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found +kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found +kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found +kubectl delete crd probes.monitoring.coreos.com --ignore-not-found +kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found +kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found +kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found +kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found + +log_info "Removing monitoring namespace..." 
+kubectl delete namespace "${MONITORING_NAMESPACE}" --ignore-not-found + +log_success "Observability stack uninstalled" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh new file mode 100755 index 000000000..cce604c99 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# +# Uninstall OSMO Backend Operator +# Reverses everything deployed by 06-deploy-osmo-backend.sh +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +OSMO_OPERATOR_NAMESPACE="osmo-operator" +OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" + +echo "" +echo "========================================" +echo " Uninstalling OSMO Backend Operator" +echo "========================================" +echo "" + +log_warning "This will remove:" +echo " - Helm release: osmo-operator (namespace: ${OSMO_OPERATOR_NAMESPACE})" +echo " - Secret: osmo-operator-token (namespace: ${OSMO_OPERATOR_NAMESPACE})" +echo " - Namespace: ${OSMO_OPERATOR_NAMESPACE}" +echo " - Namespace: ${OSMO_WORKFLOWS_NAMESPACE} (and all workflow pods)" +echo "" +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +# Uninstall Helm release +if helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then + log_info "Uninstalling Helm release: osmo-operator..." + helm uninstall osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" --wait --timeout 5m + log_success "Helm release uninstalled" +else + log_info "Helm release osmo-operator not found — skipping" +fi + +# Delete secrets +log_info "Removing secrets..." 
+kubectl delete secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found + +# Delete the internal agent service (created by 05-deploy-osmo-control-plane.sh for backend operator) +log_info "Removing osmo-agent-internal service..." +kubectl delete svc osmo-agent-internal -n "${OSMO_NAMESPACE}" --ignore-not-found + +# Delete namespaces (this also removes any remaining resources inside them) +log_info "Deleting namespace: ${OSMO_WORKFLOWS_NAMESPACE}..." +kubectl delete namespace "${OSMO_WORKFLOWS_NAMESPACE}" --ignore-not-found --wait=false + +log_info "Deleting namespace: ${OSMO_OPERATOR_NAMESPACE}..." +kubectl delete namespace "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found --wait=false + +echo "" +log_success "OSMO Backend Operator uninstalled" +echo "" +echo "Note: Namespace deletion may continue in the background." +echo " kubectl get ns ${OSMO_OPERATOR_NAMESPACE} ${OSMO_WORKFLOWS_NAMESPACE} 2>/dev/null" +echo "" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh new file mode 100755 index 000000000..0abb5f560 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Uninstall OSMO Control Plane +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../lib/common.sh" +source "${SCRIPT_DIR}/../defaults.sh" + +echo "" +echo "========================================" +echo " Uninstalling OSMO Control Plane" +echo "========================================" +echo "" + +log_warning "This will remove OSMO Control Plane and all OSMO resources" +read_prompt_var "Continue? (y/N)" confirm "" +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "Cancelled" + exit 0 +fi + +log_info "Removing OSMO Control Plane..." 
# =============================================================================
# Default Configuration for Setup Scripts
#
# Every variable uses the ${VAR:-default} pattern so any value can be
# overridden from the caller's environment before this file is sourced.
# (Previously the first half of this file used unconditional assignments,
# which silently clobbered caller overrides while the second half honored
# them — now the whole file behaves consistently.)
# =============================================================================

# Namespaces
export GPU_OPERATOR_NAMESPACE="${GPU_OPERATOR_NAMESPACE:-gpu-operator}"
export NETWORK_OPERATOR_NAMESPACE="${NETWORK_OPERATOR_NAMESPACE:-network-operator}"
export KAI_SCHEDULER_NAMESPACE="${KAI_SCHEDULER_NAMESPACE:-kai-scheduler}"
export MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
export OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}"

# Chart versions (leave empty for latest)
export GPU_OPERATOR_VERSION="${GPU_OPERATOR_VERSION:-}"
export NETWORK_OPERATOR_VERSION="${NETWORK_OPERATOR_VERSION:-}"
# Check https://github.com/NVIDIA/KAI-Scheduler/releases for newer tags.
export KAI_SCHEDULER_VERSION="${KAI_SCHEDULER_VERSION:-v0.12.4}"
export PROMETHEUS_VERSION="${PROMETHEUS_VERSION:-}"
export GRAFANA_VERSION="${GRAFANA_VERSION:-}"
export LOKI_VERSION="${LOKI_VERSION:-}"

# GPU Operator settings
export GPU_DRIVER_ENABLED="${GPU_DRIVER_ENABLED:-false}"   # Use Nebius driver-full images
export TOOLKIT_ENABLED="${TOOLKIT_ENABLED:-true}"
export DEVICE_PLUGIN_ENABLED="${DEVICE_PLUGIN_ENABLED:-true}"
export MIG_MANAGER_ENABLED="${MIG_MANAGER_ENABLED:-false}"

# Network Operator (only needed for InfiniBand/GPU clusters)
export ENABLE_NETWORK_OPERATOR="${ENABLE_NETWORK_OPERATOR:-false}"   # Set to "true" if using InfiniBand

# Observability settings
export PROMETHEUS_RETENTION_DAYS="${PROMETHEUS_RETENTION_DAYS:-15}"
export LOKI_RETENTION_DAYS="${LOKI_RETENTION_DAYS:-7}"
export GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-}"   # Auto-generated if empty

# NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh)
# Namespace where the NGINX Ingress Controller is deployed.
export INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}"
# Hostname for Ingress rules (e.g. osmo.example.com). Leave empty to use the LoadBalancer IP directly.
export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}"
# Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty.
export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}"

# TLS / SSL Configuration
# Set OSMO_TLS_ENABLED=true after running 03a (certbot) or 03c (cert-manager).
export OSMO_TLS_ENABLED="${OSMO_TLS_ENABLED:-false}"
# Name of the Kubernetes TLS secret used by Ingress (both paths produce this secret).
# NOTE: The OSMO Helm chart generates ingress TLS with secretName "osmo-tls".
export OSMO_TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}"
# Local directory where certbot stores certificate files (Path A only).
export OSMO_TLS_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}"
# Email for Let's Encrypt registration (required for 03a and 03c).
export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-}"
# cert-manager namespace (Path B / 03c only).
export CERT_MANAGER_NAMESPACE="${CERT_MANAGER_NAMESPACE:-cert-manager}"
# Name of the ClusterIssuer created by 03c (Path B only).
export CLUSTER_ISSUER_NAME="${CLUSTER_ISSUER_NAME:-letsencrypt-prod}"
# TLS mode: "certbot" or "cert-manager". Set automatically by 03a/03c.
export OSMO_TLS_MODE="${OSMO_TLS_MODE:-}"

# Keycloak / Authentication
# Set DEPLOY_KEYCLOAK=true to deploy Keycloak and enable OSMO auth with Envoy sidecars.
export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}"
# Keycloak hostname (e.g. auth-osmo-nebius.csptst.nvidia.com).
# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth-<hostname>.
export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}"
# TLS secret name for the Keycloak ingress (separate from the main osmo-tls).
# Run 03a with OSMO_TLS_SECRET_NAME=osmo-tls-auth for the auth subdomain.
export KEYCLOAK_TLS_SECRET_NAME="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}"

# Paths
# NOTE(review): this overwrites any SCRIPT_DIR set by the sourcing script
# (e.g. the cleanup/ scripts) with the directory of *this* file — VALUES_DIR
# and LIB_DIR depend on that behavior; confirm before changing.
export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export VALUES_DIR="${SCRIPT_DIR}/values"
export LIB_DIR="${SCRIPT_DIR}/lib"
# Read input with a prompt into a variable (bash/zsh compatible).
#
# Arguments:
#   $1 - prompt text
#   $2 - name of the variable to populate in the caller's scope
#   $3 - default value used when the user just presses Enter
#
# Reads from /dev/tty when available (so it still works when the
# script's stdin is redirected) and falls back to stdin otherwise.
read_prompt_var() {
  local prompt=$1
  local var_name=$2
  local default=$3
  local value=""
  local read_from="/dev/tty"
  local write_to="/dev/tty"

  if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then
    read_from="/dev/stdin"
    write_to="/dev/stdout"
  fi

  if [[ -n "$default" ]]; then
    printf "%s [%s]: " "$prompt" "$default" >"$write_to"
  else
    printf "%s: " "$prompt" >"$write_to"
  fi

  IFS= read -r value <"$read_from"
  if [[ -z "$value" && -n "$default" ]]; then
    value="$default"
  fi

  # printf -v assigns safely even when the value contains quotes or shell
  # metacharacters. The previous eval "$var_name='$value'" broke on a
  # single quote in the input and was a command-injection vector.
  printf -v "$var_name" '%s' "$value"
}

# Read a secret value into a variable (terminal echo disabled).
#
# Arguments:
#   $1 - prompt text
#   $2 - name of the variable to populate in the caller's scope
read_secret_var() {
  local prompt=$1
  local var_name=$2
  local value=""
  local read_from="/dev/tty"
  local write_to="/dev/tty"

  if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then
    read_from="/dev/stdin"
    write_to="/dev/stdout"
  fi

  printf "%s: " "$prompt" >"$write_to"
  # stty fails when read_from is not a tty (e.g. piped stdin); guard it so
  # a `set -e` caller does not abort just because echo cannot be disabled.
  stty -echo <"$read_from" 2>/dev/null || true
  IFS= read -r value <"$read_from" || true
  stty echo <"$read_from" 2>/dev/null || true
  printf "\n" >"$write_to"

  # Safe assignment — see read_prompt_var for why eval is avoided.
  printf -v "$var_name" '%s' "$value"
}

# Return 0 if the given command exists on PATH, non-zero otherwise.
check_command() {
  command -v "$1" &>/dev/null
}
# Poll a command until it succeeds or a timeout expires.
#
# Arguments:
#   $1 - human-readable description (used in log messages)
#   $2 - timeout in seconds (default 300)
#   $3 - poll interval in seconds (default 10)
#   $@ - remaining args: the command to run on each poll
#
# Returns 0 as soon as the command succeeds, 1 on timeout. Prints one
# dot per failed poll as a progress indicator.
wait_for_condition() {
  local what=$1
  local max_wait=${2:-300}
  local poll_every=${3:-10}
  shift 3
  local probe=("$@")

  log_info "Waiting for $what (timeout: ${max_wait}s)..."

  local waited=0
  until [[ $waited -ge $max_wait ]]; do
    if "${probe[@]}" &>/dev/null; then
      log_success "$what"
      return 0
    fi
    sleep "$poll_every"
    waited=$((waited + poll_every))
    echo -n "."
  done

  echo ""
  log_error "Timeout waiting for $what"
  return 1
}
# Resolve the externally reachable OSMO base URL from the NGINX Ingress
# Controller's LoadBalancer.
#
# When OSMO_TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, the
# configured hostname wins (TLS certificates are issued for the domain)
# and "https://<hostname>" is returned immediately. Otherwise the
# controller service is inspected in this order:
#   1. LoadBalancer external IP (cloud assigns a public/internal IP)
#   2. LoadBalancer hostname (some clouds return a DNS name, e.g. AWS ELB)
#   3. Controller ClusterIP (fallback — only reachable inside the cluster)
#
# Prints the URL on stdout and returns 0; returns 1 if nothing is found.
#
# Usage:
#   url=$(detect_service_url)
#   [[ -n "$url" ]] && echo "OSMO reachable at $url"
detect_service_url() {
  local ingress_ns="${INGRESS_NAMESPACE:-ingress-nginx}"
  local proto="http"

  if [[ "${OSMO_TLS_ENABLED:-false}" == "true" ]]; then
    proto="https"
    if [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then
      echo "${proto}://${OSMO_INGRESS_HOSTNAME}"
      return 0
    fi
  fi

  # Locate the controller service (works for the community ingress-nginx chart).
  local controller
  controller=$(kubectl get svc -n "$ingress_ns" \
    -l app.kubernetes.io/name=ingress-nginx,app.kubernetes.io/component=controller \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
  if [[ -z "$controller" ]]; then
    return 1
  fi

  # Try each address source in preference order; "None" is what a headless
  # service reports for clusterIP and is never a usable address.
  local jsonpath candidate
  for jsonpath in \
    '{.status.loadBalancer.ingress[0].ip}' \
    '{.status.loadBalancer.ingress[0].hostname}' \
    '{.spec.clusterIP}'; do
    candidate=$(kubectl get svc "$controller" -n "$ingress_ns" \
      -o jsonpath="$jsonpath" 2>/dev/null || true)
    if [[ -n "$candidate" && "$candidate" != "None" ]]; then
      echo "${proto}://${candidate}"
      return 0
    fi
  done

  # Nothing found
  return 1
}
# Issue a JSON request against the OSMO API.
#
# When _OSMO_AUTH_BYPASS=true (Envoy sidecar bypassed via pod
# port-forward), the x-osmo-user / x-osmo-roles headers are injected so
# the OSMO service authorises the request directly.
#
# Usage:   osmo_curl <METHOD> <URL> [extra curl args...]
# Example: osmo_curl GET "http://localhost:8080/api/configs/service"
# Example: osmo_curl PATCH "http://localhost:8080/api/configs/service" -d '{"configs_dict":{...}}'
osmo_curl() {
  local http_method="$1"
  local target_url="$2"
  shift 2

  # Always send JSON; append the admin bypass headers only when needed.
  local header_args=(-H "Content-Type: application/json")
  if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then
    header_args+=(-H "x-osmo-user: osmo-admin")
    header_args+=(-H "x-osmo-roles: osmo-admin,osmo-user")
  fi

  curl -s -X "$http_method" "$target_url" "${header_args[@]}" "$@"
}
# Update an OSMO config via the PATCH API (partial merge).
#
# When _OSMO_AUTH_BYPASS=true the request is sent with curl (osmo_curl
# adds the admin bypass headers); otherwise the osmo CLI is used.
#
# Arguments:
#   $1 - config type, e.g. WORKFLOW (lower-cased to build the URL)
#   $2 - path to a JSON file with the partial config
#   $3 - change description (default "Update config")
#   $4 - local port of the port-forward (default 8080)
#
# Returns 0 on HTTP 2xx (or CLI success), 1 otherwise.
osmo_config_update() {
  local config_type="$1"
  local json_file="$2"
  local description="${3:-Update config}"
  local port="${4:-8080}"

  if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then
    local endpoint
    endpoint="api/configs/$(echo "$config_type" | tr '[:upper:]' '[:lower:]')"

    # Build PATCH request body: {"description": "...", "configs_dict": <file contents>}
    local body
    if ! body=$(jq -n --arg desc "$description" --slurpfile cfg "$json_file" \
        '{description: $desc, configs_dict: $cfg[0]}'); then
      log_error "Failed to build request body from ${json_file}"
      return 1
    fi

    # Unpredictable temp file for the response instead of a fixed name in
    # /tmp (avoids clobbering between runs and symlink attacks).
    local resp_file
    resp_file=$(mktemp) || return 1

    local http_code
    http_code=$(osmo_curl PATCH "http://localhost:${port}/${endpoint}" \
      -d "$body" -o "$resp_file" -w "%{http_code}")

    if [[ "$http_code" =~ ^2 ]]; then
      rm -f -- "$resp_file"
      return 0
    fi

    log_error "PATCH /${endpoint} returned HTTP ${http_code}"
    cat "$resp_file" 2>/dev/null || true
    rm -f -- "$resp_file"
    return 1
  else
    osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null
  fi
}
postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + worker: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + logger: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + agent: + scaling: + minReplicas: 1 + maxReplicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - 
name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + + delayedJobMonitor: + replicas: 1 + extraEnv: + - name: OSMO_POSTGRES_HOST + value: postgresql.osmo.svc.cluster.local + - name: OSMO_POSTGRES_PORT + value: "5432" + - name: OSMO_POSTGRES_USER + value: osmo_admin + - name: OSMO_POSTGRES_DATABASE + value: osmo + - name: OSMO_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + extraVolumes: + - name: vault-secrets + secret: + secretName: vault-secrets + extraVolumeMounts: + - name: vault-secrets + mountPath: /home/osmo/vault-agent/secrets + readOnly: true + +sidecars: + envoy: + enabled: false + rateLimit: + enabled: false + logAgent: + enabled: false + otel: + enabled: false diff --git a/applications/osmo/deploy/002a-setup/sample_osmo_realm.json b/applications/osmo/deploy/002a-setup/sample_osmo_realm.json new file mode 100755 index 000000000..54a65ed77 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/sample_osmo_realm.json @@ -0,0 +1,2636 @@ +{ + "id": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "realm": "osmo", + "notBefore": 0, + "defaultSignatureAlgorithm": "RS256", + "revokeRefreshToken": false, + "refreshTokenMaxReuse": 0, + "accessTokenLifespan": 300, + "accessTokenLifespanForImplicitFlow": 900, + "ssoSessionIdleTimeout": 604800, + "ssoSessionMaxLifespan": 604800, + "ssoSessionIdleTimeoutRememberMe": 0, + "ssoSessionMaxLifespanRememberMe": 0, + "offlineSessionIdleTimeout": 2592000, + "offlineSessionMaxLifespanEnabled": false, + "offlineSessionMaxLifespan": 5184000, + "clientSessionIdleTimeout": 0, + "clientSessionMaxLifespan": 0, + "clientOfflineSessionIdleTimeout": 0, + "clientOfflineSessionMaxLifespan": 0, + "accessCodeLifespan": 60, + "accessCodeLifespanUserAction": 300, + "accessCodeLifespanLogin": 1800, + "actionTokenGeneratedByAdminLifespan": 43200, + "actionTokenGeneratedByUserLifespan": 300, + 
"oauth2DeviceCodeLifespan": 600, + "oauth2DevicePollingInterval": 5, + "enabled": true, + "sslRequired": "external", + "registrationAllowed": false, + "registrationEmailAsUsername": false, + "rememberMe": false, + "verifyEmail": false, + "loginWithEmailAllowed": false, + "duplicateEmailsAllowed": false, + "resetPasswordAllowed": false, + "editUsernameAllowed": false, + "bruteForceProtected": true, + "permanentLockout": false, + "maxTemporaryLockouts": 0, + "bruteForceStrategy": "MULTIPLE", + "maxFailureWaitSeconds": 300, + "minimumQuickLoginWaitSeconds": 60, + "waitIncrementSeconds": 60, + "quickLoginCheckMilliSeconds": 1000, + "maxDeltaTimeSeconds": 43200, + "failureFactor": 30, + "roles": { + "realm": [ + { + "id": "2fbf71d8-d3c1-4de3-8c08-ae55b254e092", + "name": "uma_authorization", + "description": "${role_uma_authorization}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "e22b93a7-88eb-4f66-a5cc-7c68a35d72fb", + "name": "offline_access", + "description": "${role_offline-access}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "c3d524ce-b3c8-42fd-9e6b-777a32960bb2", + "name": "admin", + "description": "${role_admin}", + "composite": true, + "composites": { + "realm": [ + "create-realm" + ], + "client": { + "realm-management": [ + "manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + 
"description": "${role_default-roles}", + "composite": true, + "composites": { + "realm": [ + "offline_access", + "uma_authorization" + ], + "client": { + "account": [ + "manage-account", + "view-profile" + ] + } + }, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + }, + { + "id": "f5584dff-7c44-4204-b387-e3caf8ea3f46", + "name": "create-realm", + "description": "${role_create-realm}", + "composite": false, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", + "attributes": {} + } + ], + "client": { + "osmo-realm": [], + "realm-management": [ + { + "id": "b8b96d4c-fc77-4e20-bc64-4918144dfdcf", + "name": "manage-realm", + "description": "${role_manage-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1dbd9f8f-e5e6-41b3-ba7c-746835fd9b79", + "name": "query-clients", + "description": "${role_query-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d27fc846-afad-42f9-8b11-636f4c535a36", + "name": "view-users", + "description": "${role_view-users}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-groups", + "query-users" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "3c345b77-4bdb-4360-bf81-fe85a77cbff7", + "name": "manage-identity-providers", + "description": "${role_manage-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "4953639a-2db7-45d7-a734-c42b487647c5", + "name": "impersonation", + "description": "${role_impersonation}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": 
"ae14995a-6e23-4b1d-a10d-dd0feebf1d4a", + "name": "view-events", + "description": "${role_view-events}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "5ae16954-f8ad-4237-be92-1eb6916ce6cb", + "name": "manage-authorization", + "description": "${role_manage-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "7663ba0a-60f3-46bb-9232-3a2cc1832e62", + "name": "query-realms", + "description": "${role_query-realms}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "121f50ad-06c7-4541-a40f-400710228515", + "name": "manage-clients", + "description": "${role_manage-clients}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "d8c6a12c-240c-415c-9299-30f5292d2b90", + "name": "view-clients", + "description": "${role_view-clients}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + "query-clients" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "70ebf14f-cf79-4ad7-b4c4-3d5289288ec0", + "name": "create-client", + "description": "${role_create-client}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "1abf94ab-c2a7-469c-b081-584fbbb66046", + "name": "query-groups", + "description": "${role_query-groups}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "f8e1d204-7b77-446a-84fb-675c8c85e1f1", + "name": "realm-admin", + "description": "${role_realm-admin}", + "composite": true, + "composites": { + "client": { + "realm-management": [ + 
"manage-realm", + "query-clients", + "view-users", + "manage-identity-providers", + "impersonation", + "view-events", + "manage-authorization", + "query-realms", + "manage-clients", + "view-clients", + "create-client", + "query-groups", + "view-identity-providers", + "view-realm", + "view-authorization", + "manage-users", + "query-users", + "manage-events" + ] + } + }, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "72066e7f-f80f-4008-a0b3-531d3aebd2f0", + "name": "view-identity-providers", + "description": "${role_view-identity-providers}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "29649597-fdc9-4330-a96d-94218a1e91b2", + "name": "view-realm", + "description": "${role_view-realm}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "12c80e9d-c3d9-4e61-91ab-c986e3aafe48", + "name": "view-authorization", + "description": "${role_view-authorization}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "bde16849-39b1-4c85-985d-40e9a178e873", + "name": "manage-users", + "description": "${role_manage-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "62463d22-8113-41e0-af6a-fa81883c475d", + "name": "query-users", + "description": "${role_query-users}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + }, + { + "id": "e1afbd19-239f-4e78-abd9-5019b6baa7e2", + "name": "manage-events", + "description": "${role_manage-events}", + "composite": false, + "clientRole": true, + "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "attributes": {} + } + ], + "osmo-browser-flow": [ + 
{ + "id": "2cfce9e9-000e-4de8-a0b6-50f0a4252db3", + "name": "dashboard-admin", + "description": "Able to make change to the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "454726d1-4f76-47f6-bcfa-5d64f759134f", + "name": "grafana-user", + "description": "Able to view dashboards in grafana", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9d91ae54-e69b-46e8-baee-7a16f044ded1", + "name": "osmo-user", + "description": "A regular user of osmo who can submit and query workflows and datasets", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "9ec3a04d-49a4-414b-9e2f-35b70bbea18b", + "name": "dashboard-user", + "description": "Able to view the kubernetes dashboard", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "dfd62581-88c7-4ebb-beac-7555d1aef105", + "name": "grafana-admin", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + }, + { + "id": "aa86ac92-9df4-499c-9f78-e3ed600ddc15", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "attributes": {} + } + ], + "security-admin-console": [], + "admin-cli": [], + "account-console": [], + "broker": [ + { + "id": "44300967-5867-4c57-a59a-5b8302cb8323", + "name": "read-token", + "description": "${role_read-token}", + "composite": false, + "clientRole": true, + "containerId": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "attributes": {} + } + ], + "osmo-device": [ + { + "id": "e126038f-20eb-4d31-a95b-e5267eb8c7f1", + "name": "osmo-user", + 
"description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "20874405-f96b-456b-a3b8-86cfe8740144", + "name": "osmo-admin", + "description": "Admin access to the osmo service", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + }, + { + "id": "94a41f7f-9927-489f-aa76-a9e3dafb4ed5", + "name": "osmo-backend", + "description": "", + "composite": false, + "clientRole": true, + "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "attributes": {} + } + ], + "account": [ + { + "id": "358c4e88-41b8-458b-83d9-e4c86a357095", + "name": "manage-account-links", + "description": "${role_manage-account-links}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "499f54a7-ccc5-4fef-bece-9ccdc6a80308", + "name": "manage-consent", + "description": "${role_manage-consent}", + "composite": true, + "composites": { + "client": { + "account": [ + "view-consent" + ] + } + }, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "f14ea475-e733-4f69-8475-693da2992a72", + "name": "view-applications", + "description": "${role_view-applications}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "aea168f8-7115-468b-9118-aae87937dee9", + "name": "view-consent", + "description": "${role_view-consent}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "47acd969-e55d-4382-946b-67fb2e4bb119", + "name": "manage-account", + "description": "${role_manage-account}", + "composite": true, + "composites": { + "client": { + "account": [ + "manage-account-links" + ] + } + }, + "clientRole": true, + "containerId": 
"049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "102cd4a5-8e95-4d3c-87de-a98c2958f5c0", + "name": "view-groups", + "description": "${role_view-groups}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "b6da542f-977e-437e-8d24-6cb4ed4612af", + "name": "delete-account", + "description": "${role_delete-account}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + }, + { + "id": "2da758ad-a74d-43ef-b911-6b52c8b60d90", + "name": "view-profile", + "description": "${role_view-profile}", + "composite": false, + "clientRole": true, + "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", + "attributes": {} + } + ] + } + }, + "groups": [ + { + "id": "979a1cd5-b392-4905-a868-17603faf9ca9", + "name": "Admin", + "path": "/Admin", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "osmo-admin" + ], + "osmo-device": [ + "osmo-user", + "osmo-admin" + ] + } + }, + { + "id": "2fc39861-b636-47c8-b57b-d1719466759c", + "name": "Backend Operator", + "path": "/Backend Operator", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-device": [ + "osmo-backend" + ] + } + }, + { + "id": "57a9b7f0-36ec-46c5-9781-49d53b1c6468", + "name": "User", + "path": "/User", + "subGroups": [], + "attributes": {}, + "realmRoles": [], + "clientRoles": { + "osmo-browser-flow": [ + "osmo-user", + "grafana-user", + "dashboard-user" + ], + "osmo-device": [ + "osmo-user" + ] + } + } + ], + "defaultRole": { + "id": "996ba034-02ae-40d4-8d14-735506151057", + "name": "default-roles-osmo", + "description": "${role_default-roles}", + "composite": true, + "clientRole": false, + "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c" + }, + "requiredCredentials": [ + "password" + ], + "otpPolicyType": "totp", + "otpPolicyAlgorithm": 
"HmacSHA1", + "otpPolicyInitialCounter": 0, + "otpPolicyDigits": 6, + "otpPolicyLookAheadWindow": 1, + "otpPolicyPeriod": 30, + "otpPolicyCodeReusable": false, + "otpSupportedApplications": [ + "totpAppFreeOTPName", + "totpAppGoogleName", + "totpAppMicrosoftAuthenticatorName" + ], + "localizationTexts": {}, + "webAuthnPolicyRpEntityName": "keycloak", + "webAuthnPolicySignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyRpId": "", + "webAuthnPolicyAttestationConveyancePreference": "not specified", + "webAuthnPolicyAuthenticatorAttachment": "not specified", + "webAuthnPolicyRequireResidentKey": "not specified", + "webAuthnPolicyUserVerificationRequirement": "not specified", + "webAuthnPolicyCreateTimeout": 0, + "webAuthnPolicyAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyAcceptableAaguids": [], + "webAuthnPolicyExtraOrigins": [], + "webAuthnPolicyPasswordlessRpEntityName": "keycloak", + "webAuthnPolicyPasswordlessSignatureAlgorithms": [ + "ES256" + ], + "webAuthnPolicyPasswordlessRpId": "", + "webAuthnPolicyPasswordlessAttestationConveyancePreference": "not specified", + "webAuthnPolicyPasswordlessAuthenticatorAttachment": "not specified", + "webAuthnPolicyPasswordlessRequireResidentKey": "not specified", + "webAuthnPolicyPasswordlessUserVerificationRequirement": "not specified", + "webAuthnPolicyPasswordlessCreateTimeout": 0, + "webAuthnPolicyPasswordlessAvoidSameAuthenticatorRegister": false, + "webAuthnPolicyPasswordlessAcceptableAaguids": [], + "webAuthnPolicyPasswordlessExtraOrigins": [], + "scopeMappings": [ + { + "clientScope": "offline_access", + "roles": [ + "offline_access" + ] + } + ], + "clientScopeMappings": { + "account": [ + { + "client": "account-console", + "roles": [ + "manage-account", + "view-groups" + ] + } + ] + }, + "clients": [ + { + "id": "049b45a3-ba14-4735-8168-c9be73625a6f", + "clientId": "account", + "name": "${client_account}", + "rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": 
false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "a18dadb1-a13d-4523-8e33-446ff5781676", + "clientId": "account-console", + "name": "${client_account-console}", + "rootUrl": "${authBaseUrl}", + "baseUrl": "/realms/osmo/account/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/realms/osmo/account/*" + ], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "d3db99fd-64a1-48b8-82bd-a92533e2fd4c", + "name": "audience resolve", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + 
"consentRequired": false, + "config": {} + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "14047566-1501-4403-92c7-418ef38e3ba4", + "clientId": "admin-cli", + "name": "${client_admin-cli}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", + "clientId": "broker", + "name": "${client_broker}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + 
"fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", + "clientId": "osmo-browser-flow", + "name": "Osmo Browser Flow", + "description": "Allow logging into osmo using the authorization code based browser flow", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com/docs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "secret": "**********", + "redirectUris": [ + "", + "https://default.com/setup/getAToken", + "https://default.com/getAToken", + "https://default.com/api/auth/getAToken" + ], + "webOrigins": [ + "*", + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "client.secret.creation.time": "1762965594", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false", + "use.refresh.tokens": "true", + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "client_credentials.use_refresh_token": "false", + "acr.loa.map": "{}", + "require.pushed.authorization.requests": "false", + "tls.client.certificate.bound.access.tokens": "false", + "display.on.consent.screen": "false", + "token.response.type.bearer.lower-case": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + 
"nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "8fcbb19c-503b-4173-a35b-69cc23bc112f", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-browser-flow" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", + "clientId": "osmo-device", + "name": "Osmo device flow", + "description": "Allow login with devices such as cli", + "rootUrl": "https://default.com", + "adminUrl": "https://default.com", + "baseUrl": "https://default.com", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "https://default.com/*" + ], + "webOrigins": [ + "https://default.com" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": true, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": true, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "frontchannel.logout.session.required": "true", + "display.on.consent.screen": "false", + "oauth2.device.authorization.grant.enabled": "true", + "backchannel.logout.revoke.offline.tokens": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + 
"nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "21f8be09-ffc5-4a26-855b-6be4ab297c67", + "name": "Create \"roles\" claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "roles", + "jsonType.label": "String", + "usermodel.clientRoleMapping.clientId": "osmo-device" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "06a0fe4b-c247-4233-af67-78138bf5337a", + "clientId": "osmo-realm", + "name": "OSMO Realm", + "description": "", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "oidc.ciba.grant.enabled": "false", + "backchannel.logout.session.required": "true", + "post.logout.redirect.uris": "+", + "oauth2.device.authorization.grant.enabled": "false", + "backchannel.logout.revoke.offline.tokens": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [], + "optionalClientScopes": [] + }, + { + "id": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", + "clientId": "realm-management", + "name": "${client_realm-management}", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + 
"clientAuthenticatorType": "client-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": true, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "true", + "post.logout.redirect.uris": "+" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": false, + "nodeReRegistrationTimeout": 0, + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "c70e9b76-96a2-41da-84da-df8b9e0d228d", + "clientId": "security-admin-console", + "name": "${client_security-admin-console}", + "rootUrl": "${authAdminUrl}", + "baseUrl": "/admin/osmo/console/", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": false, + "clientAuthenticatorType": "client-secret", + "redirectUris": [ + "/admin/osmo/console/*" + ], + "webOrigins": [ + "+" + ], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": true, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "publicClient": true, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "realm_client": "false", + "client.use.lightweight.access.token.enabled": "true", + "post.logout.redirect.uris": "+", + "pkce.code.challenge.method": "S256" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": 0, + "protocolMappers": [ + { + "id": "e921764f-2d7f-4a08-833c-204801a096db", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + 
"user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "profile", + "roles", + "basic", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + } + ], + "clientScopes": [ + { + "id": "e172a6de-ad7d-4cbd-be06-010d284b6806", + "name": "basic", + "description": "OpenID Connect scope for add all basic claims to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "e67f2d9e-7cf0-4875-a72d-ce4a086adf7b", + "name": "auth_time", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "AUTH_TIME", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "auth_time", + "jsonType.label": "long" + } + }, + { + "id": "eba73e8f-7d13-46c7-9e6e-44e8839b1022", + "name": "sub", + "protocol": "openid-connect", + "protocolMapper": "oidc-sub-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "introspection.token.claim": "true" + } + } + ] + }, + { + "id": "76307a43-d2c9-40df-a686-6c4c10e0f70d", + "name": "address", + "description": "OpenID Connect built-in scope: address", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${addressScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "32ac1e8f-3680-4c50-8bb4-7eed44c679b1", + "name": "address", + "protocol": "openid-connect", + "protocolMapper": "oidc-address-mapper", + "consentRequired": false, + "config": { + "user.attribute.formatted": 
"formatted", + "user.attribute.country": "country", + "user.attribute.postal_code": "postal_code", + "userinfo.token.claim": "true", + "user.attribute.street": "street", + "id.token.claim": "true", + "user.attribute.region": "region", + "access.token.claim": "true", + "user.attribute.locality": "locality" + } + } + ] + }, + { + "id": "67a444ee-3246-4878-a525-e0015e9b31cb", + "name": "offline_access", + "description": "OpenID Connect built-in scope: offline_access", + "protocol": "openid-connect", + "attributes": { + "consent.screen.text": "${offlineAccessScopeConsentText}", + "display.on.consent.screen": "true" + } + }, + { + "id": "1e8f098a-66fe-4df2-9547-47be0d040c53", + "name": "email", + "description": "OpenID Connect built-in scope: email", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${emailScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "00e95ac6-b825-4180-9558-4dffeac9584a", + "name": "email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "email", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9f5125d5-3b89-4f0f-a13e-b8fbb4d6afc1", + "name": "email verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-property-mapper", + "consentRequired": false, + "config": { + "user.attribute": "emailVerified", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "email_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "988f9517-5cd2-4b66-90ba-3399d667d0f8", + "name": "role_list", + "description": "SAML role list", + "protocol": "saml", + "attributes": { + "consent.screen.text": "${samlRoleListScopeConsentText}", + 
"display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "b78abf35-1108-40e2-a3c8-c6ea4200e817", + "name": "role list", + "protocol": "saml", + "protocolMapper": "saml-role-list-mapper", + "consentRequired": false, + "config": { + "single": "false", + "attribute.nameformat": "Basic", + "attribute.name": "Role" + } + } + ] + }, + { + "id": "f1dcc0f6-63be-4f85-a8cd-d43072e0eba4", + "name": "microprofile-jwt", + "description": "Microprofile - JWT built-in scope", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "bf488bdc-2622-45f0-95c2-df2d05fd3fab", + "name": "upn", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "upn", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5aa8e8c1-f0d7-46c4-b2da-24aa9608da9f", + "name": "groups", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + "multivalued": "true", + "userinfo.token.claim": "true", + "user.attribute": "foo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "groups", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "fe58e218-3aac-4780-8b5e-b61491cd457b", + "name": "profile", + "description": "OpenID Connect built-in scope: profile", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${profileScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "e0616aae-d3e0-4911-98b2-db72ad142938", + "name": "nickname", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + 
"user.attribute": "nickname", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "nickname", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "49cc1e1d-9401-4b57-b8a9-a37573f2eb06", + "name": "profile", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "profile", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "profile", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "e05eea05-f917-4ef3-a82f-501c82192bd6", + "name": "gender", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "gender", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "gender", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "89c031e1-bfad-4afd-af24-51db2c62a11f", + "name": "username", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "username", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "preferred_username", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "30d27d3e-3b72-49d1-a66f-0466b58dbf3b", + "name": "locale", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "locale", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "locale", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "9fc26d9e-c109-4b30-8ec2-2fc2d95b11d6", + "name": "picture", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "picture", + 
"id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "picture", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "5c0dbd32-7a45-4dc9-9e4f-37570ebf5d38", + "name": "family name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "lastName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "family_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "2de0c290-124a-41be-b7d8-f61f63eed5ef", + "name": "full name", + "protocol": "openid-connect", + "protocolMapper": "oidc-full-name-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + }, + { + "id": "369e67dd-fd5e-4d90-8d80-c945c7a0c049", + "name": "updated at", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "updatedAt", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "updated_at", + "jsonType.label": "long", + "userinfo.token.claim": "true" + } + }, + { + "id": "7557b943-11a1-42bb-a119-35e8da9fcb99", + "name": "birthdate", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "birthdate", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "birthdate", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "06359527-ce26-45f7-beba-7ccf5e71d6f5", + "name": "given name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "firstName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "given_name", + "jsonType.label": 
"String", + "userinfo.token.claim": "true" + } + }, + { + "id": "8f3bfe54-a74a-4eed-b2bd-4157fc574b57", + "name": "middle name", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "middleName", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "middle_name", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "a6cbf817-a0f5-483d-ae1e-c716d04e1645", + "name": "website", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "website", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "website", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "1322fc37-04e4-4e89-99d4-6c304ad36c96", + "name": "zoneinfo", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "zoneinfo", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "zoneinfo", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + } + ] + }, + { + "id": "6aec68b8-7178-449d-9ba6-b6e1c2a9be73", + "name": "service_account", + "description": "Specific scope for a client enabled for service accounts", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "91715642-086a-493b-8f01-5c64d408b7e3", + "name": "Client ID", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "client_id", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "client_id", + "jsonType.label": "String" + } 
+ }, + { + "id": "78dcf109-44bb-4aca-9540-a8896f26e864", + "name": "Client Host", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientHost", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientHost", + "jsonType.label": "String" + } + }, + { + "id": "e28a076d-9ee0-46ec-a2f0-a147bab66a09", + "name": "Client IP Address", + "protocol": "openid-connect", + "protocolMapper": "oidc-usersessionmodel-note-mapper", + "consentRequired": false, + "config": { + "user.session.note": "clientAddress", + "introspection.token.claim": "true", + "userinfo.token.claim": "true", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "clientAddress", + "jsonType.label": "String" + } + } + ] + }, + { + "id": "e728df12-1bff-418d-a68d-c2036d856db2", + "name": "roles", + "description": "OpenID Connect scope for add user roles to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "${rolesScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "993f7f9d-55ba-4c1f-b84a-76e2c733bc94", + "name": "client roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-client-role-mapper", + "consentRequired": false, + "config": { + "user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "resource_access.${client_id}.roles", + "jsonType.label": "String", + "multivalued": "true" + } + }, + { + "id": "f0b2b858-1cde-412b-a1c8-8ed3bd4e04d6", + "name": "realm roles", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-realm-role-mapper", + "consentRequired": false, + "config": { + "user.attribute": "foo", + "access.token.claim": "true", + "claim.name": "realm_access.roles", + "jsonType.label": "String", + 
"multivalued": "true" + } + }, + { + "id": "32ad3286-1486-4196-9232-533af4c10009", + "name": "audience resolve", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-resolve-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "efee9fbd-1a06-41d4-94d1-16b59f8d9a68", + "name": "web-origins", + "description": "OpenID Connect scope for add allowed web origins to the access token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "consent.screen.text": "", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "61110fbc-75c7-40cd-aca2-9b7a714b0b22", + "name": "allowed web origins", + "protocol": "openid-connect", + "protocolMapper": "oidc-allowed-origins-mapper", + "consentRequired": false, + "config": {} + } + ] + }, + { + "id": "4a0abefc-0423-403d-8383-10f989580c13", + "name": "phone", + "description": "OpenID Connect built-in scope: phone", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "true", + "consent.screen.text": "${phoneScopeConsentText}", + "display.on.consent.screen": "true" + }, + "protocolMappers": [ + { + "id": "acdce654-be20-4386-bd4f-edf2cd868f6b", + "name": "phone number", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumber", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "phone_number", + "jsonType.label": "String", + "userinfo.token.claim": "true" + } + }, + { + "id": "37082e43-4429-479d-bd80-7b8d11b17769", + "name": "phone number verified", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": false, + "config": { + "user.attribute": "phoneNumberVerified", + "id.token.claim": "true", + "access.token.claim": "true", + "claim.name": "phone_number_verified", + "jsonType.label": "boolean", + "userinfo.token.claim": "true" + } + } + 
] + }, + { + "id": "1e5f680b-df5f-4d8c-b9c9-52b5445171ce", + "name": "acr", + "description": "OpenID Connect scope for add acr (authentication context class reference) to the token", + "protocol": "openid-connect", + "attributes": { + "include.in.token.scope": "false", + "display.on.consent.screen": "false" + }, + "protocolMappers": [ + { + "id": "590accb2-1b94-452e-bb20-51bc643fe860", + "name": "acr loa level", + "protocol": "openid-connect", + "protocolMapper": "oidc-acr-mapper", + "consentRequired": false, + "config": { + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true" + } + } + ] + } + ], + "defaultDefaultClientScopes": [ + "role_list", + "profile", + "email", + "roles", + "web-origins", + "acr", + "basic" + ], + "defaultOptionalClientScopes": [ + "offline_access", + "address", + "phone", + "microprofile-jwt" + ], + "browserSecurityHeaders": { + "contentSecurityPolicyReportOnly": "", + "xContentTypeOptions": "nosniff", + "referrerPolicy": "no-referrer", + "xRobotsTag": "none", + "xFrameOptions": "SAMEORIGIN", + "contentSecurityPolicy": "frame-src 'self'; frame-ancestors 'self'; object-src 'none';", + "xXSSProtection": "1; mode=block", + "strictTransportSecurity": "max-age=31536000; includeSubDomains" + }, + "smtpServer": {}, + "eventsEnabled": false, + "eventsListeners": [ + "jboss-logging" + ], + "enabledEventTypes": [], + "adminEventsEnabled": false, + "adminEventsDetailsEnabled": false, + "identityProviders": [], + "identityProviderMappers": [], + "components": { + "org.keycloak.services.clientregistration.policy.ClientRegistrationPolicy": [ + { + "id": "76bd801e-c608-4338-8198-668c92446a35", + "name": "Full Scope Disabled", + "providerId": "scope", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "06472a8f-7614-4022-b08e-62f023a5fe0a", + "name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "anonymous", + "subComponents": {}, + "config": { + 
"allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "3667ac91-1abf-4124-91e6-ffc803dc29aa", + "name": "Consent Required", + "providerId": "consent-required", + "subType": "anonymous", + "subComponents": {}, + "config": {} + }, + { + "id": "6e0c8a3f-b5f4-4a49-b44c-bde8ae314d89", + "name": "Max Clients Limit", + "providerId": "max-clients", + "subType": "anonymous", + "subComponents": {}, + "config": { + "max-clients": [ + "200" + ] + } + }, + { + "id": "62d78a88-78a2-4ea7-937b-9a062e946108", + "name": "Trusted Hosts", + "providerId": "trusted-hosts", + "subType": "anonymous", + "subComponents": {}, + "config": { + "host-sending-registration-request-must-match": [ + "true" + ], + "client-uris-must-match": [ + "true" + ] + } + }, + { + "id": "0ca9718d-bfca-4059-b7e8-e32ae3f70a7f", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "oidc-address-mapper", + "saml-user-property-mapper", + "oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-full-name-mapper", + "saml-role-list-mapper", + "saml-user-attribute-mapper", + "oidc-sha256-pairwise-sub-mapper" + ] + } + }, + { + "id": "9247c25c-ce3e-4858-8dda-b2c95b2f4d09", + "name": "Allowed Client Scopes", + "providerId": "allowed-client-templates", + "subType": "authenticated", + "subComponents": {}, + "config": { + "allow-default-scopes": [ + "true" + ] + } + }, + { + "id": "2d3e37a6-c167-4992-abf8-8cbe22f1bcb9", + "name": "Allowed Protocol Mapper Types", + "providerId": "allowed-protocol-mappers", + "subType": "anonymous", + "subComponents": {}, + "config": { + "allowed-protocol-mapper-types": [ + "saml-user-property-mapper", + "oidc-full-name-mapper", + "oidc-address-mapper", + "saml-role-list-mapper", + "oidc-usermodel-attribute-mapper", + "oidc-usermodel-property-mapper", + "oidc-sha256-pairwise-sub-mapper", + "saml-user-attribute-mapper" + ] + } + 
} + ], + "org.keycloak.userprofile.UserProfileProvider": [ + { + "id": "c12df2b1-cd7d-46b7-ba91-b4381a59f487", + "providerId": "declarative-user-profile", + "subComponents": {}, + "config": { + "kc.user.profile.config": [ + "{\"attributes\":[{\"name\":\"username\",\"displayName\":\"${username}\",\"validations\":{\"length\":{\"min\":3,\"max\":255},\"username-prohibited-characters\":{},\"up-username-not-idn-homograph\":{}},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"email\",\"displayName\":\"${email}\",\"validations\":{\"email\":{},\"length\":{\"max\":255}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"firstName\",\"displayName\":\"${firstName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"lastName\",\"displayName\":\"${lastName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false}],\"groups\":[{\"name\":\"user-metadata\",\"displayHeader\":\"User metadata\",\"displayDescription\":\"Attributes, which refer to user metadata\"}],\"unmanagedAttributePolicy\":\"ENABLED\"}" + ] + } + } + ], + "org.keycloak.keys.KeyProvider": [ + { + "id": "29577a17-9e8a-40cf-b804-cf36c2cf567c", + "name": "hmac-generated-hs512", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS512" + ] + } + }, + { + "id": "48051b03-e0a1-413d-af4a-d9c301f12662", + "name": "rsa-enc-generated", + "providerId": "rsa-enc-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + 
"RSA-OAEP" + ] + } + }, + { + "id": "04c1d0e1-6889-48d2-833a-449a2a9e6fe1", + "name": "hmac-generated", + "providerId": "hmac-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ], + "algorithm": [ + "HS256" + ] + } + }, + { + "id": "500737be-f83b-4e67-954e-9e71ca7ed1b0", + "name": "rsa-generated", + "providerId": "rsa-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + }, + { + "id": "7842aa88-a8fb-49a2-ac10-e437337e236a", + "name": "aes-generated", + "providerId": "aes-generated", + "subComponents": {}, + "config": { + "priority": [ + "100" + ] + } + } + ] + }, + "internationalizationEnabled": false, + "supportedLocales": [], + "authenticationFlows": [ + { + "id": "43f7c655-a9cd-4d53-8161-3b3d2008c126", + "alias": "Account verification options", + "description": "Method with which to verity the existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-email-verification", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Verify Existing Account by Re-authentication", + "userSetupAllowed": false + } + ] + }, + { + "id": "0f5c2215-5f40-4509-bb6f-f28c9b743388", + "alias": "Browser - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": 
false, + "userSetupAllowed": false + } + ] + }, + { + "id": "eb66c86a-efdc-4039-9153-cd4708f39ba7", + "alias": "Direct Grant - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e68e679a-5fc1-427b-93c6-5657f3ff6eb1", + "alias": "First broker login - Conditional OTP", + "description": "Flow to determine if the OTP is required for the authentication", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-otp-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "e4a832f6-bae3-41c6-8198-5c14c6ddf706", + "alias": "Handle Existing Account", + "description": "Handle what to do if there is existing account with same email/username like authenticated identity provider", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-confirm-link", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": 
true, + "flowAlias": "Account verification options", + "userSetupAllowed": false + } + ] + }, + { + "id": "2bbaf432-1058-4ee4-a994-d87f1c224032", + "alias": "Reset - Conditional OTP", + "description": "Flow to determine if the OTP should be reset or not. Set to REQUIRED to force.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "conditional-user-configured", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-otp", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "352782b8-ddae-4ddc-af19-86a2900ef1f9", + "alias": "User creation or linking", + "description": "Flow for the existing/non-existing user alternatives", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "create unique user config", + "authenticator": "idp-create-user-if-unique", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Handle Existing Account", + "userSetupAllowed": false + } + ] + }, + { + "id": "fdc0ecfb-67f8-4390-85a0-50ecfdc66800", + "alias": "Verify Existing Account by Re-authentication", + "description": "Reauthentication of existing account", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "idp-username-password-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + 
"priority": 20, + "autheticatorFlow": true, + "flowAlias": "First broker login - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a656206c-59b9-47cf-8880-c0f04f04a0c3", + "alias": "browser", + "description": "browser based authentication", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-cookie", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "auth-spnego", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "identity-provider-redirector", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 25, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "ALTERNATIVE", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "forms", + "userSetupAllowed": false + } + ] + }, + { + "id": "7616793a-19e4-4d97-b7ae-ab962acaf444", + "alias": "clients", + "description": "Base authentication for clients", + "providerId": "client-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "client-secret", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-secret-jwt", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 30, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "client-x509", + "authenticatorFlow": false, + "requirement": "ALTERNATIVE", + "priority": 40, + 
"autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "1f5446d7-d5de-47fb-8e15-347105d3d062", + "alias": "direct grant", + "description": "OpenID Connect Resource Owner Grant", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "direct-grant-validate-username", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "direct-grant-validate-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 30, + "autheticatorFlow": true, + "flowAlias": "Direct Grant - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "a55463dd-3ced-4102-a263-c121db059379", + "alias": "docker auth", + "description": "Used by Docker clients to authenticate against the IDP", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "docker-http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, + { + "id": "646a12ee-99e7-41cd-a1ea-3ed5e5a96dcf", + "alias": "first broker login", + "description": "Actions taken after first broker login with identity provider account, which is not yet linked to any Keycloak account", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticatorConfig": "review profile config", + "authenticator": "idp-review-profile", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 20, + 
"autheticatorFlow": true, + "flowAlias": "User creation or linking", + "userSetupAllowed": false + } + ] + }, + { + "id": "03f283e4-7b80-4b38-b90d-33ba8b0a07c3", + "alias": "forms", + "description": "Username, password, otp and other auth forms.", + "providerId": "basic-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "auth-username-password-form", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 20, + "autheticatorFlow": true, + "flowAlias": "Browser - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "047f04f4-b2c9-4aa9-bc38-4ed2c17d3e2c", + "alias": "registration", + "description": "registration flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-page-form", + "authenticatorFlow": true, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": true, + "flowAlias": "registration form", + "userSetupAllowed": false + } + ] + }, + { + "id": "51cfacd6-9ee8-4fb2-a3fe-9e00246d9877", + "alias": "registration form", + "description": "registration form", + "providerId": "form-flow", + "topLevel": false, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "registration-user-creation", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "registration-password-action", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 50, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "registration-recaptcha-action", + "authenticatorFlow": false, + "requirement": "DISABLED", + "priority": 60, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + }, 
+ { + "id": "28bb511d-c4ea-4bb8-805c-086eeaf7b239", + "alias": "reset credentials", + "description": "Reset credentials for a user if they forgot their password or something", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "reset-credentials-choose-user", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-credential-email", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 20, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticator": "reset-password", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 30, + "autheticatorFlow": false, + "userSetupAllowed": false + }, + { + "authenticatorFlow": true, + "requirement": "CONDITIONAL", + "priority": 40, + "autheticatorFlow": true, + "flowAlias": "Reset - Conditional OTP", + "userSetupAllowed": false + } + ] + }, + { + "id": "d0189a78-5979-47ce-8536-32c8f6dec1b6", + "alias": "saml ecp", + "description": "SAML ECP Profile Authentication Flow", + "providerId": "basic-flow", + "topLevel": true, + "builtIn": true, + "authenticationExecutions": [ + { + "authenticator": "http-basic-authenticator", + "authenticatorFlow": false, + "requirement": "REQUIRED", + "priority": 10, + "autheticatorFlow": false, + "userSetupAllowed": false + } + ] + } + ], + "authenticatorConfig": [ + { + "id": "09fd7502-4e05-437f-865a-221fa1297e67", + "alias": "create unique user config", + "config": { + "require.password.update.after.registration": "false" + } + }, + { + "id": "9abca294-1e03-418f-841c-18b00053f949", + "alias": "review profile config", + "config": { + "update.profile.on.first.login": "missing" + } + } + ], + "requiredActions": [ + { + "alias": "CONFIGURE_TOTP", + "name": "Configure OTP", + "providerId": "CONFIGURE_TOTP", + "enabled": true, + "defaultAction": false, + 
"priority": 10, + "config": {} + }, + { + "alias": "TERMS_AND_CONDITIONS", + "name": "Terms and Conditions", + "providerId": "TERMS_AND_CONDITIONS", + "enabled": false, + "defaultAction": false, + "priority": 20, + "config": {} + }, + { + "alias": "UPDATE_PASSWORD", + "name": "Update Password", + "providerId": "UPDATE_PASSWORD", + "enabled": true, + "defaultAction": false, + "priority": 30, + "config": {} + }, + { + "alias": "UPDATE_PROFILE", + "name": "Update Profile", + "providerId": "UPDATE_PROFILE", + "enabled": true, + "defaultAction": false, + "priority": 40, + "config": {} + }, + { + "alias": "VERIFY_EMAIL", + "name": "Verify Email", + "providerId": "VERIFY_EMAIL", + "enabled": true, + "defaultAction": false, + "priority": 50, + "config": {} + }, + { + "alias": "delete_account", + "name": "Delete Account", + "providerId": "delete_account", + "enabled": false, + "defaultAction": false, + "priority": 60, + "config": {} + }, + { + "alias": "webauthn-register", + "name": "Webauthn Register", + "providerId": "webauthn-register", + "enabled": true, + "defaultAction": false, + "priority": 70, + "config": {} + }, + { + "alias": "webauthn-register-passwordless", + "name": "Webauthn Register Passwordless", + "providerId": "webauthn-register-passwordless", + "enabled": true, + "defaultAction": false, + "priority": 80, + "config": {} + }, + { + "alias": "delete_credential", + "name": "Delete Credential", + "providerId": "delete_credential", + "enabled": true, + "defaultAction": false, + "priority": 100, + "config": {} + }, + { + "alias": "update_user_locale", + "name": "Update User Locale", + "providerId": "update_user_locale", + "enabled": true, + "defaultAction": false, + "priority": 1000, + "config": {} + } + ], + "browserFlow": "browser", + "registrationFlow": "registration", + "directGrantFlow": "direct grant", + "resetCredentialsFlow": "reset credentials", + "clientAuthenticationFlow": "clients", + "dockerAuthenticationFlow": "docker auth", + 
"firstBrokerLoginFlow": "first broker login", + "attributes": { + "cibaBackchannelTokenDeliveryMode": "poll", + "cibaExpiresIn": "120", + "cibaAuthRequestedUserHint": "login_hint", + "oauth2DeviceCodeLifespan": "600", + "clientOfflineSessionMaxLifespan": "0", + "oauth2DevicePollingInterval": "5", + "clientSessionIdleTimeout": "0", + "parRequestUriLifespan": "60", + "clientSessionMaxLifespan": "0", + "clientOfflineSessionIdleTimeout": "0", + "cibaInterval": "5", + "realmReusableOtpCode": "false" + }, + "keycloakVersion": "26.1.1", + "userManagedAccessAllowed": false, + "organizationsEnabled": false, + "verifiableCredentialsEnabled": false, + "adminPermissionsEnabled": false, + "clientProfiles": { + "profiles": [] + }, + "clientPolicies": { + "policies": [] + } +} diff --git a/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml b/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml new file mode 100755 index 000000000..11cc02fdf --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml @@ -0,0 +1,57 @@ +# GPU Operator Helm Values +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator +# https://docs.nebius.com/kubernetes/gpu/set-up + +operator: + defaultRuntime: containerd + +# Enable driver installation by GPU Operator +# Even though Nebius nodes may have pre-installed drivers, the GPU Operator +# needs to manage the driver lifecycle for proper integration with device-plugin, +# toolkit, and other components. 
+driver: + enabled: true + # Let GPU Operator choose the appropriate driver version + # version: auto-detected by operator + upgradePolicy: + autoUpgrade: false # Don't auto-upgrade to avoid conflicts + +toolkit: + enabled: true + +devicePlugin: + enabled: true + config: + default: "any" + +dcgm: + enabled: true + +dcgmExporter: + enabled: true + serviceMonitor: + enabled: true + +gfd: + enabled: true + +migManager: + enabled: false + +nodeStatusExporter: + enabled: true + +# Node selector for GPU operator pods +node-feature-discovery: + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Tolerations for GPU workloads +daemonsets: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/applications/osmo/deploy/002a-setup/values/grafana.yaml b/applications/osmo/deploy/002a-setup/values/grafana.yaml new file mode 100755 index 000000000..ab8dd6b6b --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/grafana.yaml @@ -0,0 +1,70 @@ +# Grafana Helm Values (standalone) +# https://github.com/grafana/helm-charts/tree/main/charts/grafana + +# Note: Grafana is typically deployed as part of kube-prometheus-stack +# This file is for standalone Grafana deployment if needed + +replicas: 1 + +adminUser: admin +# adminPassword should be set via --set or secret + +persistence: + enabled: true + size: 10Gi + storageClassName: "" + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# Datasources +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://prometheus-kube-prometheus-prometheus:9090 + access: proxy + isDefault: true + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + +# Dashboard providers +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + 
options: + path: /var/lib/grafana/dashboards/default + +# Sidecar for dashboards +sidecar: + dashboards: + enabled: true + label: grafana_dashboard + datasources: + enabled: true + label: grafana_datasource + +# Service +service: + type: ClusterIP + port: 80 + +# Ingress (disabled by default) +ingress: + enabled: false diff --git a/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml b/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml new file mode 100755 index 000000000..320c867db --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml @@ -0,0 +1,13 @@ +# KAI Scheduler Helm Values +# GPU-aware scheduler for OSMO +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html + +global: + # Modify the node selectors and tolerations to match your cluster + nodeSelector: {} + tolerations: [] + +scheduler: + additionalArgs: + - --default-staleness-grace-period=-1s # Disable staleness eviction + - --update-pod-eviction-condition=true # Enable OSMO to read preemption conditions diff --git a/applications/osmo/deploy/002a-setup/values/loki.yaml b/applications/osmo/deploy/002a-setup/values/loki.yaml new file mode 100755 index 000000000..f4c277a22 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/loki.yaml @@ -0,0 +1,68 @@ +# Loki Stack Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/loki-stack + +loki: + enabled: true + + persistence: + enabled: true + size: 50Gi + + config: + auth_enabled: false + + server: + http_listen_port: 3100 + + ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + chunk_idle_period: 15m + chunk_retain_period: 30s + + schema_config: + configs: + - from: 2020-01-01 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + + storage_config: + boltdb_shipper: + active_index_directory: /data/loki/boltdb-shipper-active + cache_location: 
/data/loki/boltdb-shipper-cache + shared_store: filesystem + filesystem: + directory: /data/loki/chunks + + limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h + max_entries_limit_per_query: 5000 + + table_manager: + retention_deletes_enabled: true + retention_period: 168h # 7 days + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + +# Promtail is deployed separately +promtail: + enabled: false + +# Grafana is deployed via kube-prometheus-stack +grafana: + enabled: false diff --git a/applications/osmo/deploy/002a-setup/values/network-operator.yaml b/applications/osmo/deploy/002a-setup/values/network-operator.yaml new file mode 100755 index 000000000..146a9daca --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/network-operator.yaml @@ -0,0 +1,62 @@ +# Network Operator Helm Values +# https://docs.nvidia.com/networking/display/cokan10/network+operator + +# Operator settings +operator: + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + +# RDMA shared device plugin (for InfiniBand) +rdmaSharedDevicePlugin: + deploy: true + resources: + - name: rdma_shared_device_a + vendors: [15b3] + deviceIDs: [101b, 101d, 1017, 1019] + ifNames: ["*"] + +# SR-IOV device plugin +sriovDevicePlugin: + deploy: false + +# NIC cluster policy +nicClusterPolicy: + deploy: true + + # RDMA + rdmaSharedDevicePlugin: + image: k8s-rdma-shared-dev-plugin + repository: ghcr.io/mellanox + version: sha-4f3eb55 + +# Secondary network +secondaryNetwork: + deploy: true + + # Multus CNI + multus: + deploy: true + image: multus-cni + repository: ghcr.io/k8snetworkplumbingwg + version: v3.9.3 + + # CNI plugins + cniPlugins: + deploy: true + image: plugins + repository: ghcr.io/k8snetworkplumbingwg + version: 
v1.3.0 + + # IPAM plugin + ipamPlugin: + deploy: true + image: whereabouts + repository: ghcr.io/k8snetworkplumbingwg + version: v0.6.2 diff --git a/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml b/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml new file mode 100755 index 000000000..b4781ae21 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml @@ -0,0 +1,37 @@ +# OSMO Backend Operator Values +# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html + +global: + # REQUIRED: OSMO image tag (e.g., 6.0.0) + osmoImageTag: "6.0.0" + + # REQUIRED: Your OSMO service URL + serviceUrl: "https://osmo.example.com" + + # Namespaces + agentNamespace: "osmo-operator" + backendNamespace: "osmo-workflows" + + # REQUIRED: Unique name for this backend + backendName: "nebius-backend" + + # Authentication + accountTokenSecret: "osmo-operator-token" + loginMethod: "token" + + # Resource configuration + services: + backendListener: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" + backendWorker: + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + memory: "1Gi" diff --git a/applications/osmo/deploy/002a-setup/values/prometheus.yaml b/applications/osmo/deploy/002a-setup/values/prometheus.yaml new file mode 100755 index 000000000..12cc634d9 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/prometheus.yaml @@ -0,0 +1,109 @@ +# Prometheus Stack Helm Values +# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +# Prometheus +prometheus: + prometheusSpec: + # Some CRDs require this to be >= 60 + maximumStartupDurationSeconds: 60 + retention: 15d + + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 8Gi + + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + + # Service monitors + 
serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + +# Grafana +grafana: + enabled: true + + adminUser: admin + # adminPassword is set via --set flag + + persistence: + enabled: true + size: 10Gi + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # Additional datasources + additionalDataSources: + - name: Loki + type: loki + url: http://loki:3100 + access: proxy + isDefault: false + + # Dashboards + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + # GPU dashboard + dashboards: + default: + nvidia-dcgm: + gnetId: 12239 + revision: 2 + datasource: Prometheus + +# Alertmanager +alertmanager: + enabled: true + + alertmanagerSpec: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Node exporter +nodeExporter: + enabled: true + +# Kube state metrics +kubeStateMetrics: + enabled: true + +# Prometheus operator +prometheusOperator: + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/applications/osmo/deploy/002a-setup/values/promtail.yaml b/applications/osmo/deploy/002a-setup/values/promtail.yaml new file mode 100755 index 000000000..601d29e57 --- /dev/null +++ b/applications/osmo/deploy/002a-setup/values/promtail.yaml @@ -0,0 +1,46 @@ +# Promtail Helm Values +# https://github.com/grafana/helm-charts/tree/main/charts/promtail + +config: + clients: + - url: http://loki:3100/loki/api/v1/push + + snippets: + pipelineStages: + - cri: {} + - json: + expressions: + level: level + message: msg + - labels: + level: + - output: + source: message + +# Resources +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# Tolerations to run on all nodes +tolerations: + - key: 
node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +# Volume mounts (for containerd logs if needed) +# Note: The default chart already mounts /var/lib/docker and /var/log +# Only add extra volumes if you need additional paths +extraVolumes: [] +extraVolumeMounts: [] + +# Service monitor +serviceMonitor: + enabled: true From c21da25a52fbc95b66044255fbf9f5f996be00e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 12 Feb 2026 22:41:52 +0100 Subject: [PATCH 21/37] - fix oauth issues --- .../osmo/deploy/002-setup/04-enable-tls.sh | 20 ++++++++++--------- .../002-setup/05-deploy-osmo-control-plane.sh | 13 ++++++++++++ .../osmo/deploy/002a-setup/04-enable-tls.sh | 20 ++++++++++--------- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/applications/osmo/deploy/002-setup/04-enable-tls.sh b/applications/osmo/deploy/002-setup/04-enable-tls.sh index 800b7ffb3..cbc6cb7de 100755 --- a/applications/osmo/deploy/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/002-setup/04-enable-tls.sh @@ -343,10 +343,11 @@ EOF log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" fi - # Clean up the bootstrap Ingress if Keycloak will create its own - if [[ "$OSMO_DEPLOYED" == "true" ]]; then - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null - fi + # Clean up the bootstrap Ingress once the certificate is issued. + # If left in place, the NGINX admission webhook will reject any Helm chart + # (e.g. Keycloak) that tries to create an ingress for the same host+path. + log_info "Removing auth bootstrap ingress (certificate provisioned)..." 
+ kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null fi # ----------------------------------------------------------------------------- @@ -398,12 +399,13 @@ else fi # ----------------------------------------------------------------------------- -# Step 6: Clean up bootstrap Ingress (if OSMO was deployed after cert issued) +# Step 6: Clean up bootstrap Ingress (certificate already provisioned) # ----------------------------------------------------------------------------- -if [[ "$OSMO_DEPLOYED" == "true" ]]; then - # Remove the bootstrap ingress if it exists (from a previous Mode A run) - kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null -fi +# Always remove the bootstrap ingress once certs are issued. If left in place, +# the NGINX admission webhook will reject any Helm chart (e.g. osmo-ui) that +# tries to create an ingress for the same host+path. +log_info "Removing main bootstrap ingress (certificate provisioned)..." +kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null # ----------------------------------------------------------------------------- # Done diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh index 65307ce90..2b2f28cd1 100755 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh @@ -709,6 +709,12 @@ fi) value: "true" EOF + # Remove the TLS bootstrap ingress for the auth subdomain (if it exists). + # It was created by 04-enable-tls.sh solely to trigger the cert-manager HTTP-01 + # challenge. If left in place, the NGINX admission webhook rejects the Keycloak + # chart's ingress for the same host+path. 
+ kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + # Install or upgrade Keycloak helm upgrade --install keycloak bitnami/keycloak \ --namespace "${OSMO_NAMESPACE}" \ @@ -1432,6 +1438,13 @@ EOF # ----------------------------------------------------------------------------- # Step 6: Deploy OSMO Service # ----------------------------------------------------------------------------- + +# Remove the TLS bootstrap ingress for the main domain (if it exists). +# It was created by 04-enable-tls.sh solely to trigger the cert-manager HTTP-01 +# challenge. If left in place, its catch-all path (/) routes to a placeholder +# service and returns 503 for any path not covered by a more specific ingress. +kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true + log_info "Deploying OSMO Service..." SERVICE_HELM_ARGS=( diff --git a/applications/osmo/deploy/002a-setup/04-enable-tls.sh b/applications/osmo/deploy/002a-setup/04-enable-tls.sh index 800b7ffb3..cbc6cb7de 100755 --- a/applications/osmo/deploy/002a-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/002a-setup/04-enable-tls.sh @@ -343,10 +343,11 @@ EOF log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" fi - # Clean up the bootstrap Ingress if Keycloak will create its own - if [[ "$OSMO_DEPLOYED" == "true" ]]; then - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null - fi + # Clean up the bootstrap Ingress once the certificate is issued. + # If left in place, the NGINX admission webhook will reject any Helm chart + # (e.g. Keycloak) that tries to create an ingress for the same host+path. + log_info "Removing auth bootstrap ingress (certificate provisioned)..." 
+ kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null fi # ----------------------------------------------------------------------------- @@ -398,12 +399,13 @@ else fi # ----------------------------------------------------------------------------- -# Step 6: Clean up bootstrap Ingress (if OSMO was deployed after cert issued) +# Step 6: Clean up bootstrap Ingress (certificate already provisioned) # ----------------------------------------------------------------------------- -if [[ "$OSMO_DEPLOYED" == "true" ]]; then - # Remove the bootstrap ingress if it exists (from a previous Mode A run) - kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null -fi +# Always remove the bootstrap ingress once certs are issued. If left in place, +# the NGINX admission webhook will reject any Helm chart (e.g. osmo-ui) that +# tries to create an ingress for the same host+path. +log_info "Removing main bootstrap ingress (certificate provisioned)..." 
+kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null # ----------------------------------------------------------------------------- # Done From a79260fd8d6af48bbb0319d5775f41f4f667e0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 17 Feb 2026 11:50:56 +0100 Subject: [PATCH 22/37] - change folder structure - add bucket fixes - add scheduler fixes --- .../04-deploy-osmo-control-plane.sh | 207 +- .../osmo/deploy/002-setup/04-enable-tls.sh | 533 +++- .../05-deploy-osmo-backend.sh | 0 .../002-setup/05-deploy-osmo-control-plane.sh | 2051 ------------- .../06-configure-storage.sh | 0 .../002-setup/06-deploy-osmo-backend.sh | 329 -- .../07-configure-service-url.sh | 0 .../deploy/002-setup/07-configure-storage.sh | 254 -- .../08-configure-gpu-platform.sh | 0 .../002-setup/08-configure-service-url.sh | 154 - .../09-configure-backend-scheduler.sh | 118 + .../002-setup/09-configure-gpu-platform.sh | 167 -- .../002-setup/10-configure-dataset-bucket.sh | 249 ++ applications/osmo/deploy/002-setup/README.md | 38 +- .../osmo/deploy/002-setup/defaults.sh | 34 +- .../deploy/002-setup/gpu_pod_template.json | 55 +- .../osmo/deploy/002-setup/lib/common.sh | 233 +- .../deploy/002-setup/sample_osmo_realm.json | 0 .../002-setup/values/network-operator.yaml | 7 +- .../01-deploy-gpu-infrastructure.sh | 137 - .../002a-setup/02-deploy-observability.sh | 103 - .../002a-setup/03-deploy-nginx-ingress.sh | 89 - .../osmo/deploy/002a-setup/04-enable-tls.sh | 441 --- applications/osmo/deploy/002a-setup/README.md | 363 --- .../cleanup/uninstall-gpu-infrastructure.sh | 43 - .../002a-setup/cleanup/uninstall-keycloak.sh | 62 - .../cleanup/uninstall-nginx-ingress.sh | 20 - .../cleanup/uninstall-observability.sh | 76 - .../cleanup/uninstall-osmo-backend.sh | 63 - .../cleanup/uninstall-osmo-control-plane.sh | 34 - .../osmo/deploy/002a-setup/defaults.sh | 72 - .../002a-setup/gpu_platform_update.json | 14 - 
.../deploy/002a-setup/gpu_pod_template.json | 16 - .../osmo/deploy/002a-setup/lib/common.sh | 434 --- .../deploy/002a-setup/osmo-values-noauth.yaml | 170 -- .../deploy/002a-setup/sample_osmo_realm.json | 2636 ----------------- .../002a-setup/values/gpu-operator.yaml | 57 - .../deploy/002a-setup/values/grafana.yaml | 70 - .../002a-setup/values/kai-scheduler.yaml | 13 - .../osmo/deploy/002a-setup/values/loki.yaml | 68 - .../002a-setup/values/network-operator.yaml | 62 - .../values/osmo-backend-operator.yaml | 37 - .../deploy/002a-setup/values/prometheus.yaml | 109 - .../deploy/002a-setup/values/promtail.yaml | 46 - .../workflows/osmo/test_bucket_write.yaml | 44 + 45 files changed, 1183 insertions(+), 8525 deletions(-) rename applications/osmo/deploy/{002a-setup => 002-setup}/04-deploy-osmo-control-plane.sh (89%) rename applications/osmo/deploy/{002a-setup => 002-setup}/05-deploy-osmo-backend.sh (100%) delete mode 100755 applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh rename applications/osmo/deploy/{002a-setup => 002-setup}/06-configure-storage.sh (100%) delete mode 100755 applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh rename applications/osmo/deploy/{002a-setup => 002-setup}/07-configure-service-url.sh (100%) delete mode 100755 applications/osmo/deploy/002-setup/07-configure-storage.sh rename applications/osmo/deploy/{002a-setup => 002-setup}/08-configure-gpu-platform.sh (100%) delete mode 100755 applications/osmo/deploy/002-setup/08-configure-service-url.sh create mode 100755 applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh delete mode 100755 applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh create mode 100755 applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh mode change 100644 => 100755 applications/osmo/deploy/002-setup/sample_osmo_realm.json delete mode 100755 applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh delete mode 100755 
applications/osmo/deploy/002a-setup/02-deploy-observability.sh delete mode 100755 applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh delete mode 100755 applications/osmo/deploy/002a-setup/04-enable-tls.sh delete mode 100755 applications/osmo/deploy/002a-setup/README.md delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh delete mode 100755 applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh delete mode 100755 applications/osmo/deploy/002a-setup/defaults.sh delete mode 100755 applications/osmo/deploy/002a-setup/gpu_platform_update.json delete mode 100755 applications/osmo/deploy/002a-setup/gpu_pod_template.json delete mode 100755 applications/osmo/deploy/002a-setup/lib/common.sh delete mode 100755 applications/osmo/deploy/002a-setup/osmo-values-noauth.yaml delete mode 100755 applications/osmo/deploy/002a-setup/sample_osmo_realm.json delete mode 100755 applications/osmo/deploy/002a-setup/values/gpu-operator.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/grafana.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/loki.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/network-operator.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/prometheus.yaml delete mode 100755 applications/osmo/deploy/002a-setup/values/promtail.yaml create mode 100755 applications/osmo/workflows/osmo/test_bucket_write.yaml diff --git 
a/applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh similarity index 89% rename from applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh rename to applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh index a370f65c6..4ea8c6e71 100755 --- a/applications/osmo/deploy/002a-setup/04-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh @@ -8,7 +8,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" @@ -22,6 +22,23 @@ echo "" check_kubectl || exit 1 check_helm || exit 1 +# ----------------------------------------------------------------------------- +# Validate hostname requirements (early, before any deployment work) +# ----------------------------------------------------------------------------- +if [[ "${OSMO_TLS_ENABLED:-false}" == "true" && -z "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + log_error "OSMO_TLS_ENABLED=true requires OSMO_INGRESS_HOSTNAME to be set." + echo " TLS certificates are issued for a domain name, not a bare IP." + echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com" + exit 1 +fi + +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" && -z "${OSMO_INGRESS_HOSTNAME:-}" && -z "${KEYCLOAK_HOSTNAME:-}" ]]; then + log_error "DEPLOY_KEYCLOAK=true requires OSMO_INGRESS_HOSTNAME or KEYCLOAK_HOSTNAME to be set." + echo " KEYCLOAK_HOSTNAME is auto-derived as auth- if not set explicitly." + echo " Set your domain: export OSMO_INGRESS_HOSTNAME=osmo.example.com" + exit 1 +fi + # ----------------------------------------------------------------------------- # Configuration # ----------------------------------------------------------------------------- @@ -92,13 +109,56 @@ log_info "Using Nebius Managed PostgreSQL..." 
log_success "Database: ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" +# ----------------------------------------------------------------------------- +# Select Nebius Region +# ----------------------------------------------------------------------------- +VALID_REGIONS=("eu-north1" "me-west1") + +if [[ -n "${NEBIUS_REGION:-}" ]]; then + NEBIUS_SELECTED_REGION="$NEBIUS_REGION" + matched=false + for r in "${VALID_REGIONS[@]}"; do + [[ "$r" == "$NEBIUS_SELECTED_REGION" ]] && matched=true && break + done + if ! $matched; then + log_error "Invalid NEBIUS_REGION '${NEBIUS_SELECTED_REGION}'. Valid options: ${VALID_REGIONS[*]}" + exit 1 + fi + log_info "Using region from NEBIUS_REGION: ${NEBIUS_SELECTED_REGION}" +else + echo "Select the Nebius region for storage:" + echo "" + _idx=1 + for _r in "${VALID_REGIONS[@]}"; do + echo " ${_idx}) ${_r}" + _idx=$((_idx + 1)) + done + echo "" + while true; do + printf "Enter choice [1-${#VALID_REGIONS[@]}]: " + read -r choice + if [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#VALID_REGIONS[@]} )); then + NEBIUS_SELECTED_REGION="${VALID_REGIONS[$choice]}" + # bash arrays are 0-based, zsh arrays are 1-based; adjust if needed + if [[ -z "$NEBIUS_SELECTED_REGION" ]]; then + NEBIUS_SELECTED_REGION="${VALID_REGIONS[$((choice - 1))]}" + fi + break + fi + echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}." + done + log_info "Selected region: ${NEBIUS_SELECTED_REGION}" +fi + +S3_NEBIUS_ENDPOINT="https://storage.${NEBIUS_SELECTED_REGION}.nebius.cloud" + # ----------------------------------------------------------------------------- # Get Storage Configuration # ----------------------------------------------------------------------------- log_info "Retrieving storage configuration..." 
S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" || echo "") -S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "https://storage.eu-north1.nebius.cloud") +S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "${S3_NEBIUS_ENDPOINT}") S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" || echo "") # Secret access key is stored in MysteryBox (ephemeral, not in Terraform state) @@ -181,7 +241,7 @@ log_info "Verifying PostgreSQL connection..." break fi sleep 2 - ((test_elapsed += 2)) + test_elapsed=$((test_elapsed + 2)) done # Check test result @@ -307,7 +367,7 @@ EOF fi sleep 2 - ((elapsed += 2)) + elapsed=$((elapsed + 2)) done if [[ "$pod_status" != "Succeeded" ]]; then @@ -513,9 +573,31 @@ if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then KC_EXTERNAL="true" log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" else - log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." - log_warning "Run: OSMO_INGRESS_HOSTNAME=${AUTH_DOMAIN} OSMO_TLS_SECRET_NAME=${KC_TLS_SECRET} ./03a-setup-tls-certificate.sh" - log_warning "Keycloak will be internal-only (port-forward access)" + # Auto-recover: if local cert files exist for the auth domain, recreate the secret + KC_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" + KC_LOCAL_CERT="${KC_CERT_DIR}/live/${AUTH_DOMAIN}/fullchain.pem" + KC_LOCAL_KEY="${KC_CERT_DIR}/live/${AUTH_DOMAIN}/privkey.pem" + if [[ -f "$KC_LOCAL_CERT" && -f "$KC_LOCAL_KEY" ]]; then + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found, but local certs exist." + log_info "Auto-recovering: recreating secret from ${KC_CERT_DIR}/live/${AUTH_DOMAIN}/..." 
+ kubectl create secret tls "${KC_TLS_SECRET}" \ + --cert="${KC_LOCAL_CERT}" \ + --key="${KC_LOCAL_KEY}" \ + --namespace "${OSMO_NAMESPACE}" \ + --dry-run=client -o yaml | kubectl apply -f - + kubectl create secret tls "${KC_TLS_SECRET}" \ + --cert="${KC_LOCAL_CERT}" \ + --key="${KC_LOCAL_KEY}" \ + --namespace "${INGRESS_NAMESPACE:-ingress-nginx}" \ + --dry-run=client -o yaml | kubectl apply -f - + log_success "TLS secret '${KC_TLS_SECRET}' recreated from local cert files" + KC_EXTERNAL="true" + log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" + else + log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." + log_warning "Run: OSMO_INGRESS_HOSTNAME=${AUTH_DOMAIN} ./03a-setup-tls-certificate.sh" + log_warning "Keycloak will be internal-only (port-forward access)" + fi fi fi @@ -672,15 +754,10 @@ fi) value: "true" EOF - # Remove the TLS bootstrap ingress for the auth subdomain (if it exists). - # It was created by 04-enable-tls.sh solely to trigger the cert-manager HTTP-01 - # challenge. If left in place, the NGINX admission webhook rejects the Keycloak - # chart's ingress for the same host+path. - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true - # Install or upgrade Keycloak # Note: Don't use --wait as it can hang; we'll check status separately helm upgrade --install keycloak bitnami/keycloak \ + --version 24.4.9 \ --namespace "${OSMO_NAMESPACE}" \ -f /tmp/keycloak-values.yaml \ --timeout 10m || { @@ -770,27 +847,27 @@ spec: - | set -e KEYCLOAK_URL="http://keycloak:80" - + echo "============================================" echo " OSMO Keycloak Realm Import" echo "============================================" echo "" - + # ── Step 1: Prepare realm JSON ────────────────────────── echo "=== Step 1: Prepare realm JSON ===" echo "Customising sample_osmo_realm.json for this deployment..." 
cp /data/realm.json /tmp/realm-import.json - + # Replace placeholder URLs (https://default.com) with actual OSMO URL sed -i "s|https://default.com|${OSMO_BASE_URL}|g" /tmp/realm-import.json - + # Replace masked client secret with generated secret sed -i 's/"secret": "[*][*]*"/"secret": "${OIDC_CLIENT_SECRET}"/' /tmp/realm-import.json - + echo " OSMO URL: ${OSMO_BASE_URL}" echo " Realm JSON: \$(wc -c < /tmp/realm-import.json) bytes" echo "" - + # ── Step 2: Wait for Keycloak ─────────────────────────── echo "=== Step 2: Wait for Keycloak ===" for i in 1 2 3 4 5 6 7 8 9 10 11 12; do @@ -869,6 +946,66 @@ spec: echo "Realm 'osmo' verified" echo "" + # ── Step 4b: Set client secret for osmo-browser-flow ─── + # Keycloak ignores the "secret" field during realm import and + # generates its own random secret. We MUST explicitly set it via the + # admin API so it matches the oidc-secrets Kubernetes secret that + # Envoy reads at runtime. + echo "=== Step 4b: Set osmo-browser-flow client secret ===" + + # Refresh token (import may have been slow) + TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ + --data-urlencode "client_id=admin-cli" \ + --data-urlencode "username=admin" \ + --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ + --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + # Find the internal UUID for the osmo-browser-flow client + BROWSER_CLIENT_UUID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/clients?clientId=osmo-browser-flow" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) + + if [ -n "\$BROWSER_CLIENT_UUID" ]; then + echo " Client UUID: \$BROWSER_CLIENT_UUID" + + # GET the full client representation, replace ONLY the secret field, PUT it back. + # This preserves redirect URIs, scopes, mappers, and all other config. 
+ curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}" \ + -H "Authorization: Bearer \$TOKEN" > /tmp/browser-client.json + + # Replace the masked secret with our generated secret + # Handle both compact ("secret":"...") and spaced ("secret" : "...") JSON + sed -i 's/"secret"[ ]*:[ ]*"[^"]*"/"secret":"${OIDC_CLIENT_SECRET}"/' /tmp/browser-client.json + + SET_SECRET_HTTP=\$(curl -s -o /dev/null -w "%{http_code}" \ + -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}" \ + -H "Authorization: Bearer \$TOKEN" \ + -H "Content-Type: application/json" \ + -d @/tmp/browser-client.json) + + if [ "\$SET_SECRET_HTTP" = "204" ] || [ "\$SET_SECRET_HTTP" = "200" ]; then + echo " Client secret set successfully (HTTP \$SET_SECRET_HTTP)" + else + echo " WARNING: Failed to set client secret (HTTP \$SET_SECRET_HTTP)" + echo " OAuth browser flow may fail – check Keycloak logs" + fi + + # Verify: read back the secret and compare + ACTUAL_SECRET=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/clients/\${BROWSER_CLIENT_UUID}/client-secret" \ + -H "Authorization: Bearer \$TOKEN" | grep -o '"value":"[^"]*"' | cut -d'"' -f4) + if [ "\$ACTUAL_SECRET" = "${OIDC_CLIENT_SECRET}" ]; then + echo " Verified: client secret matches oidc-secrets" + else + echo " WARNING: Client secret mismatch!" + echo " Expected: ${OIDC_CLIENT_SECRET:0:8}..." + echo " Got: \${ACTUAL_SECRET:0:8}..." 
+ echo " This will cause 'OAuth flow failed' errors" + fi + else + echo " WARNING: osmo-browser-flow client not found after import" + echo " OAuth browser flow will not work" + fi + echo "" + # ── Step 5: Create test user ──────────────────────────── echo "=== Step 5: Create test user ===" @@ -1034,18 +1171,13 @@ if [[ "$TLS_ENABLED" == "true" ]]; then # Check that the TLS secret exists (created by 03a or 03c) OSMO_NS_CHECK="${OSMO_NAMESPACE:-osmo}" INGRESS_NS_CHECK="${INGRESS_NAMESPACE:-ingress-nginx}" + CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" TLS_SECRET_FOUND="false" if kubectl get secret "${TLS_SECRET_NAME}" -n "${OSMO_NS_CHECK}" &>/dev/null || \ kubectl get secret "${TLS_SECRET_NAME}" -n "${INGRESS_NS_CHECK}" &>/dev/null; then TLS_SECRET_FOUND="true" fi - if [[ "$TLS_SECRET_FOUND" != "true" ]]; then - log_error "TLS secret '${TLS_SECRET_NAME}' not found." - echo " Run one of these scripts first to obtain a certificate:" - echo " ./03a-setup-tls-certificate.sh (manual certbot with DNS-01)" - echo " ./03c-deploy-cert-manager.sh (automated cert-manager with HTTP-01)" - exit 1 - fi + log_success "TLS secret '${TLS_SECRET_NAME}' found" else log_info "TLS is disabled (HTTP only). Set OSMO_TLS_ENABLED=true to enable." 
@@ -1145,6 +1277,15 @@ fi) # Disable built-in OTEL metrics exporter (no collector at localhost:12345) - name: METRICS_OTEL_ENABLE value: "false" + # S3-compatible storage endpoint (Nebius Object Storage) + - name: AWS_ENDPOINT_URL_S3 + value: ${S3_NEBIUS_ENDPOINT}:443 + - name: AWS_S3_FORCE_PATH_STYLE + value: "true" + - name: AWS_DEFAULT_REGION + value: ${NEBIUS_SELECTED_REGION} + - name: OSMO_SKIP_DATA_AUTH + value: "1" # MEK volume mount extraVolumes: - name: vault-secrets @@ -1177,6 +1318,13 @@ fi) # Disable built-in OTEL metrics exporter (no collector at localhost:12345) - name: METRICS_OTEL_ENABLE value: "false" + # S3-compatible storage endpoint (Nebius Object Storage) + - name: AWS_ENDPOINT_URL_S3 + value: ${S3_NEBIUS_ENDPOINT}:443 + - name: AWS_S3_FORCE_PATH_STYLE + value: "true" + - name: AWS_DEFAULT_REGION + value: ${NEBIUS_SELECTED_REGION} extraVolumes: - name: vault-secrets secret: @@ -1360,13 +1508,6 @@ EOF # ----------------------------------------------------------------------------- # Step 6: Deploy OSMO Service # ----------------------------------------------------------------------------- - -# Remove the TLS bootstrap ingress for the main domain (if it exists). -# It was created by 04-enable-tls.sh solely to trigger the cert-manager HTTP-01 -# challenge. If left in place, its catch-all path (/) routes to a placeholder -# service and returns 503 for any path not covered by a more specific ingress. -kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true - log_info "Deploying OSMO Service..." 
SERVICE_HELM_ARGS=( diff --git a/applications/osmo/deploy/002-setup/04-enable-tls.sh b/applications/osmo/deploy/002-setup/04-enable-tls.sh index cbc6cb7de..799811d0f 100755 --- a/applications/osmo/deploy/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/002-setup/04-enable-tls.sh @@ -1,44 +1,50 @@ #!/bin/bash # -# Enable TLS/HTTPS using cert-manager + Let's Encrypt +# Enable TLS/HTTPS for OSMO using Let's Encrypt +# +# Supports two certificate methods: +# 1) cert-manager (default) — automated HTTP-01 challenges via in-cluster cert-manager +# 2) certbot — interactive manual DNS-01 challenges via local certbot binary +# +# Set OSMO_TLS_MODE=certbot or OSMO_TLS_MODE=cert-manager to skip the prompt. # # Can be run at two points in the deployment flow: # # A) Right after 03-deploy-nginx-ingress.sh (RECOMMENDED): -# Installs cert-manager, issues the TLS certificate early. -# When 05-deploy-osmo-control-plane.sh runs later, it auto-detects the -# certificate and creates TLS-enabled Ingress resources from the start. +# Issues the TLS certificate early. When 04-deploy-osmo-control-plane.sh +# runs later, it auto-detects the certificate and creates TLS-enabled Ingress. # -# B) After 05-deploy-osmo-control-plane.sh (retrofit existing deployment): +# B) After 04-deploy-osmo-control-plane.sh (retrofit existing deployment): # Does everything in (A) plus patches existing OSMO Ingress resources # and updates service_base_url to HTTPS. # # Prerequisites: # 1. NGINX Ingress Controller deployed (03-deploy-nginx-ingress.sh) -# 2. A DNS A record pointing your domain to the LoadBalancer IP +# 2. 
A DNS record pointing your domain to the LoadBalancer IP +# (A record for cert-manager/HTTP-01; TXT record for certbot/DNS-01) # # Usage: -# ./04-enable-tls.sh -# -# Example: -# ./04-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud +# ./04-enable-tls.sh [hostname] # # Optional environment variables: -# OSMO_TLS_EMAIL - Email for Let's Encrypt expiry notices (default: noreply@) +# OSMO_TLS_MODE - "cert-manager" or "certbot" (skips prompt) +# OSMO_TLS_EMAIL - Email for Let's Encrypt (default: noreply@) # OSMO_TLS_SECRET_NAME - K8s Secret name for certificate (default: osmo-tls) +# LETSENCRYPT_EMAIL - Alias for OSMO_TLS_EMAIL (certbot path) # set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" -HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" -HOSTNAME="${HOSTNAME%.}" # Strip trailing dot (FQDN notation) +MAIN_HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" +MAIN_HOSTNAME="${MAIN_HOSTNAME%.}" # Strip trailing dot (FQDN notation) TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" OSMO_NS="${OSMO_NAMESPACE:-osmo}" INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" +CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" echo "" echo "========================================" @@ -49,10 +55,11 @@ echo "" # ----------------------------------------------------------------------------- # Validate inputs # ----------------------------------------------------------------------------- -if [[ -z "$HOSTNAME" ]]; then - log_error "Usage: $0 " +if [[ -z "$MAIN_HOSTNAME" ]]; then + log_error "Hostname is required." 
echo "" - echo "Example: $0 vl51.eu-north1.osmo.nebius.cloud" + echo "Usage: $0 " + echo " or: export OSMO_INGRESS_HOSTNAME=osmo.example.com" echo "" LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) @@ -64,20 +71,18 @@ if [[ -z "$HOSTNAME" ]]; then fi check_kubectl || exit 1 -check_helm || exit 1 -log_info "Hostname: ${HOSTNAME}" +log_info "Hostname: ${MAIN_HOSTNAME}" log_info "TLS secret: ${TLS_SECRET}" # Keycloak auth subdomain support -DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" AUTH_HOSTNAME="" -if [[ "$DEPLOY_KEYCLOAK" == "true" ]]; then +if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then AUTH_HOSTNAME="${KEYCLOAK_HOSTNAME}" else - AUTH_HOSTNAME="auth.${HOSTNAME}" + AUTH_HOSTNAME="auth.${MAIN_HOSTNAME}" fi log_info "Keycloak auth hostname: ${AUTH_HOSTNAME}" log_info "Keycloak TLS secret: ${KC_TLS_SECRET}" @@ -87,7 +92,33 @@ fi LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) -# Prompt user to set up DNS records before proceeding +# ----------------------------------------------------------------------------- +# Select TLS method +# ----------------------------------------------------------------------------- +TLS_MODE="${OSMO_TLS_MODE:-}" +if [[ -z "$TLS_MODE" ]]; then + echo "" + echo "Select TLS certificate method:" + echo "" + echo " 1) cert-manager — automated HTTP-01 challenges (requires DNS A record)" + echo " 2) certbot — interactive DNS-01 challenges (requires DNS TXT record)" + echo "" + while true; do + printf "Enter choice [1-2] (default: 1): " + read -r _tls_choice + case "${_tls_choice:-1}" in + 1) TLS_MODE="cert-manager"; break ;; + 2) TLS_MODE="certbot"; break ;; + *) echo "Invalid selection." 
;; + esac + done +fi + +log_info "TLS method: ${TLS_MODE}" + +# ----------------------------------------------------------------------------- +# DNS info +# ----------------------------------------------------------------------------- echo "" echo "========================================" echo " DNS Record Setup Required" @@ -96,7 +127,7 @@ echo "" if [[ -n "$LB_IP" ]]; then echo "Create the following DNS A record(s) pointing to your LoadBalancer IP:" echo "" - echo " ${HOSTNAME} -> ${LB_IP}" + echo " ${MAIN_HOSTNAME} -> ${LB_IP}" if [[ -n "$AUTH_HOSTNAME" ]]; then echo " ${AUTH_HOSTNAME} -> ${LB_IP}" fi @@ -105,29 +136,33 @@ else echo " kubectl get svc -n ${INGRESS_NS} ingress-nginx-controller" echo "" echo "Once the IP is available, create DNS A record(s) for:" - echo " ${HOSTNAME}" + echo " ${MAIN_HOSTNAME}" if [[ -n "$AUTH_HOSTNAME" ]]; then echo " ${AUTH_HOSTNAME}" fi fi echo "" -echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." +if [[ "$TLS_MODE" == "certbot" ]]; then + echo "Certbot DNS-01 challenges require you to create TXT records when prompted." +else + echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." +fi echo "" read_prompt_var "Press Enter once DNS records are configured (or type 'skip' to skip DNS check)" DNS_CONFIRM "" # Verify DNS resolves to the LoadBalancer IP if [[ "$DNS_CONFIRM" != "skip" ]]; then - DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) + DNS_IP=$(dig +short "$MAIN_HOSTNAME" 2>/dev/null | tail -1 || true) if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then if [[ "$DNS_IP" == "$LB_IP" ]]; then - log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" + log_success "DNS check: ${MAIN_HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" else - log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" - log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." 
+ log_warning "DNS mismatch: ${MAIN_HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" + log_warning "Let's Encrypt challenge may fail if DNS doesn't point to the LoadBalancer." fi elif [[ -z "$DNS_IP" ]]; then - log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." + log_warning "Could not resolve ${MAIN_HOSTNAME}. Make sure the DNS record exists." fi if [[ -n "$AUTH_HOSTNAME" ]]; then @@ -150,35 +185,219 @@ if [[ "$INGRESS_COUNT" -gt 0 ]]; then log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS} (will patch with TLS)" OSMO_DEPLOYED="true" else - log_info "No OSMO Ingress resources yet — preparing cert-manager and certificate" - log_info "Step 05 will auto-detect the TLS cert and create HTTPS Ingress" + log_info "No OSMO Ingress resources yet — preparing certificate" + log_info "04-deploy-osmo-control-plane.sh will auto-detect the TLS cert" OSMO_DEPLOYED="false" fi -# ----------------------------------------------------------------------------- -# Step 1: Install cert-manager -# ----------------------------------------------------------------------------- -log_info "Installing cert-manager..." -helm repo add jetstack https://charts.jetstack.io --force-update -helm repo update jetstack +# Ensure the OSMO namespace exists +kubectl create namespace "${OSMO_NS}" --dry-run=client -o yaml | kubectl apply -f - + +# ============================================================================= +# Helper: create K8s TLS secret in both namespaces from cert/key files +# ============================================================================= +create_tls_secret_from_files() { + local secret_name="$1" + local cert_path="$2" + local key_path="$3" + + log_info "Creating TLS secret '${secret_name}' in namespace '${INGRESS_NS}'..." 
+ kubectl create secret tls "${secret_name}" \ + --cert="${cert_path}" \ + --key="${key_path}" \ + --namespace "${INGRESS_NS}" \ + --dry-run=client -o yaml | kubectl apply -f - + + if [[ "$OSMO_NS" != "$INGRESS_NS" ]]; then + log_info "Creating TLS secret '${secret_name}' in namespace '${OSMO_NS}'..." + kubectl create secret tls "${secret_name}" \ + --cert="${cert_path}" \ + --key="${key_path}" \ + --namespace "${OSMO_NS}" \ + --dry-run=client -o yaml | kubectl apply -f - + fi + log_success "TLS secret '${secret_name}' created" +} + +# ============================================================================= +# Helper: copy cert-manager secret to the other namespace if needed +# ============================================================================= +copy_secret_across_namespaces() { + local secret_name="$1" + if [[ "$OSMO_NS" != "$INGRESS_NS" ]]; then + # cert-manager creates the secret in the Certificate's namespace (OSMO_NS). + # Copy it to the ingress namespace so both can reference it. + if kubectl get secret "${secret_name}" -n "${OSMO_NS}" &>/dev/null; then + if ! kubectl get secret "${secret_name}" -n "${INGRESS_NS}" &>/dev/null; then + log_info "Copying secret '${secret_name}' to namespace '${INGRESS_NS}'..." + kubectl get secret "${secret_name}" -n "${OSMO_NS}" -o json \ + | jq 'del(.metadata.namespace,.metadata.resourceVersion,.metadata.uid,.metadata.creationTimestamp)' \ + | kubectl apply -n "${INGRESS_NS}" -f - + fi + fi + fi +} -if helm status cert-manager -n cert-manager &>/dev/null; then - log_info "cert-manager already installed" +# ############################################################################# +# CERTBOT PATH +# ############################################################################# +if [[ "$TLS_MODE" == "certbot" ]]; then + + # Check certbot + if ! command -v certbot &>/dev/null; then + log_error "certbot is not installed." 
+ echo "" + echo "Install certbot using one of these methods:" + echo " Ubuntu/Debian: sudo apt install certbot" + echo " macOS: brew install certbot" + echo " pip: pip install certbot" + echo " snap: sudo snap install certbot --classic" + echo "" + exit 1 + fi + log_success "certbot found: $(certbot --version 2>&1 | head -1)" + + TLS_EMAIL="${LETSENCRYPT_EMAIL:-${OSMO_TLS_EMAIL:-}}" + if [[ -z "$TLS_EMAIL" ]]; then + echo "Enter your email for Let's Encrypt registration:" + printf " Email: " + read -r TLS_EMAIL + echo "" + if [[ -z "$TLS_EMAIL" ]]; then + log_error "Email is required for certbot." + exit 1 + fi + fi + + # Build list of domains to process: "domain:secret_name" + DOMAINS_TO_PROCESS=("${MAIN_HOSTNAME}:${TLS_SECRET}") + if [[ -n "$AUTH_HOSTNAME" ]]; then + DOMAINS_TO_PROCESS+=("${AUTH_HOSTNAME}:${KC_TLS_SECRET}") + fi + + # Show plan + echo "" + echo "========================================" + echo " Certificate Plan (certbot DNS-01)" + echo "========================================" + echo "" + echo " Email: ${TLS_EMAIL}" + echo " Cert directory: ${CERT_DIR}" + echo "" + echo " Certificates to obtain:" + for entry in "${DOMAINS_TO_PROCESS[@]}"; do + d="${entry%%:*}" + s="${entry##*:}" + echo " ${d} -> secret '${s}'" + done + echo "" + if [[ ${#DOMAINS_TO_PROCESS[@]} -gt 1 ]]; then + echo " Certbot will run once per domain. Each requires a separate DNS TXT record." + echo "" + fi + read -r -p " Press Enter to continue (or Ctrl-C to abort)..." + echo "" + + # Process each domain + FAILED=() + for entry in "${DOMAINS_TO_PROCESS[@]}"; do + domain="${entry%%:*}" + secret_name="${entry##*:}" + + echo "" + echo "========================================" + echo " Certificate: ${domain}" + echo " Secret: ${secret_name}" + echo "========================================" + echo "" + + mkdir -p "${CERT_DIR}/work" "${CERT_DIR}/logs" + + echo "Certbot will ask you to create a DNS TXT record." + echo "When prompted:" + echo " 1. 
Log in to your DNS provider" + echo " 2. Create a TXT record for _acme-challenge.${domain}" + echo " 3. Wait for DNS propagation (1-5 minutes)" + echo " 4. Press Enter in this terminal to continue" + echo "" + log_info "Starting certbot for ${domain}..." + + if ! certbot certonly \ + --manual \ + --preferred-challenges dns \ + -d "${domain}" \ + --email "${TLS_EMAIL}" \ + --agree-tos \ + --no-eff-email \ + --config-dir "${CERT_DIR}" \ + --work-dir "${CERT_DIR}/work" \ + --logs-dir "${CERT_DIR}/logs"; then + log_error "certbot failed for ${domain}. Check the output above." + FAILED+=("$domain") + continue + fi + + cert_path="${CERT_DIR}/live/${domain}/fullchain.pem" + key_path="${CERT_DIR}/live/${domain}/privkey.pem" + + if [[ ! -f "$cert_path" || ! -f "$key_path" ]]; then + log_error "Certificate files not found for ${domain}." + echo " Expected cert: ${cert_path}" + echo " Expected key: ${key_path}" + FAILED+=("$domain") + continue + fi + + log_success "Certificate obtained for ${domain}" + echo " Full chain: ${cert_path}" + echo " Private key: ${key_path}" + echo "" + log_info "Certificate details:" + openssl x509 -in "${cert_path}" -noout -subject -issuer -dates 2>/dev/null || true + + # Create K8s TLS secrets in both namespaces + create_tls_secret_from_files "$secret_name" "$cert_path" "$key_path" + done + + if [[ ${#FAILED[@]} -gt 0 ]]; then + log_warning "Some certificates failed:" + for d in "${FAILED[@]}"; do + echo " - ${d}" + done + echo " Fix the issues above and re-run this script." 
+ fi + +# ############################################################################# +# CERT-MANAGER PATH +# ############################################################################# else - helm install cert-manager jetstack/cert-manager \ - --namespace cert-manager --create-namespace \ - --set crds.enabled=true \ - --wait --timeout 5m -fi -log_success "cert-manager ready" -# ----------------------------------------------------------------------------- -# Step 2: Create Let's Encrypt ClusterIssuer -# ----------------------------------------------------------------------------- -TLS_EMAIL="${OSMO_TLS_EMAIL:-noreply@${HOSTNAME#*.}}" -log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..." + check_helm || exit 1 + + # ------------------------------------------------------------------------- + # Install cert-manager + # ------------------------------------------------------------------------- + log_info "Installing cert-manager..." + helm repo add jetstack https://charts.jetstack.io --force-update + helm repo update jetstack + + if helm status cert-manager -n cert-manager &>/dev/null; then + log_info "cert-manager already installed" + else + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + --set crds.enabled=true \ + --wait --timeout 5m + fi + log_success "cert-manager ready" + + # ------------------------------------------------------------------------- + # Create Let's Encrypt ClusterIssuer + # ------------------------------------------------------------------------- + TLS_EMAIL="${OSMO_TLS_EMAIL:-${LETSENCRYPT_EMAIL:-noreply@${MAIN_HOSTNAME#*.}}}" + log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..." 
-kubectl apply -f - </dev/null); do - ing_name="${ing#*/}" - CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') + for ing in $(kubectl get ingress -n "${OSMO_NS}" -o name 2>/dev/null); do + ing_name="${ing#*/}" + CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') - kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || echo "") - if [[ "$CERT_READY" == "True" ]]; then - log_success "TLS certificate issued and ready" - break + # Wait for main certificate + log_info "Waiting for TLS certificate to be issued (up to 120s)..." + CERT_READY="" + for i in $(seq 1 24); do + CERT_READY=$(kubectl get certificate "${TLS_SECRET}" -n "${OSMO_NS}" \ + -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") + if [[ "$CERT_READY" == "True" ]]; then + log_success "TLS certificate issued and ready" + break + fi + sleep 5 + done + + if [[ "$CERT_READY" != "True" ]]; then + log_warning "Certificate not ready yet. Checking status..." + kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 + echo "" + log_info "It may take a few more minutes. Check with:" + echo " kubectl get certificate -n ${OSMO_NS}" + echo " kubectl describe challenge -n ${OSMO_NS}" fi - sleep 5 -done -if [[ "$CERT_READY" != "True" ]]; then - log_warning "Certificate not ready yet. Checking status..." - kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 - echo "" - log_info "It may take a few more minutes. 
Check with:" - echo " kubectl get certificate -n ${OSMO_NS}" - echo " kubectl describe challenge -n ${OSMO_NS}" -fi + # Copy main cert secret to ingress namespace if needed + copy_secret_across_namespaces "${TLS_SECRET}" -# ----------------------------------------------------------------------------- -# Step 4b: Issue TLS certificate for Keycloak auth subdomain (if DEPLOY_KEYCLOAK=true) -# ----------------------------------------------------------------------------- -if [[ -n "$AUTH_HOSTNAME" ]]; then - log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." + # ------------------------------------------------------------------------- + # Issue TLS certificate for Keycloak auth subdomain + # ------------------------------------------------------------------------- + if [[ -n "$AUTH_HOSTNAME" ]]; then + log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." - # Create bootstrap Ingress for auth subdomain (to trigger HTTP-01 challenge) - kubectl apply -f - </dev/null || echo "") + if [[ "$AUTH_CERT_READY" == "True" ]]; then + log_success "Auth TLS certificate issued and ready" + break + fi + sleep 5 + done - # Wait for auth certificate - log_info "Waiting for auth TLS certificate to be issued (up to 120s)..." - AUTH_CERT_READY="" - for i in $(seq 1 24); do - AUTH_CERT_READY=$(kubectl get certificate "${KC_TLS_SECRET}" -n "${OSMO_NS}" \ - -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") - if [[ "$AUTH_CERT_READY" == "True" ]]; then - log_success "Auth TLS certificate issued and ready" - break + if [[ "$AUTH_CERT_READY" != "True" ]]; then + log_warning "Auth certificate not ready yet. It may take a few more minutes." + log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" fi - sleep 5 - done - if [[ "$AUTH_CERT_READY" != "True" ]]; then - log_warning "Auth certificate not ready yet. It may take a few more minutes." 
- log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" + # Copy auth cert secret to ingress namespace if needed + copy_secret_across_namespaces "${KC_TLS_SECRET}" + + # Clean up bootstrap Ingress (prevents NGINX admission webhook conflicts) + log_info "Removing auth bootstrap ingress (certificate provisioned)..." + kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null fi - # Clean up the bootstrap Ingress once the certificate is issued. - # If left in place, the NGINX admission webhook will reject any Helm chart - # (e.g. Keycloak) that tries to create an ingress for the same host+path. - log_info "Removing auth bootstrap ingress (certificate provisioned)..." - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null -fi + # Clean up main bootstrap Ingress + log_info "Removing main bootstrap ingress (certificate provisioned)..." + kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null -# ----------------------------------------------------------------------------- -# Step 5: Update OSMO service_base_url to HTTPS (only if OSMO is deployed) -# ----------------------------------------------------------------------------- +fi # end TLS_MODE + +# ============================================================================= +# Update OSMO service_base_url to HTTPS (only if OSMO is already deployed) +# ============================================================================= if [[ "$OSMO_DEPLOYED" == "true" ]]; then - log_info "Updating OSMO service_base_url to https://${HOSTNAME}..." + log_info "Updating OSMO service_base_url to https://${MAIN_HOSTNAME}..." kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & _PF_PID=$! 
trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT - # Wait for port-forward _pf_ready=false for i in $(seq 1 15); do if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then @@ -374,7 +592,7 @@ if [[ "$OSMO_DEPLOYED" == "true" ]]; then if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then cat > /tmp/service_url_tls.json </dev/null; then @@ -382,60 +600,55 @@ SVCEOF log_success "service_base_url updated to: ${NEW_URL}" else log_warning "Could not update service_base_url automatically." - log_info "Run: ./08-configure-service-url.sh https://${HOSTNAME}" + log_info "Run: ./08-configure-service-url.sh https://${MAIN_HOSTNAME}" fi rm -f /tmp/service_url_tls.json else log_warning "Could not login to OSMO API. Update service_base_url manually:" - log_info " ./08-configure-service-url.sh https://${HOSTNAME}" + log_info " ./08-configure-service-url.sh https://${MAIN_HOSTNAME}" fi else log_warning "Could not connect to OSMO API. Update service_base_url manually:" - log_info " ./08-configure-service-url.sh https://${HOSTNAME}" + log_info " ./08-configure-service-url.sh https://${MAIN_HOSTNAME}" fi else log_info "Skipping service_base_url update (OSMO not deployed yet)" - log_info "Step 05 will auto-detect TLS and use https:// for service_base_url" + log_info "04-deploy-osmo-control-plane.sh will auto-detect TLS and use https://" fi -# ----------------------------------------------------------------------------- -# Step 6: Clean up bootstrap Ingress (certificate already provisioned) -# ----------------------------------------------------------------------------- -# Always remove the bootstrap ingress once certs are issued. If left in place, -# the NGINX admission webhook will reject any Helm chart (e.g. osmo-ui) that -# tries to create an ingress for the same host+path. -log_info "Removing main bootstrap ingress (certificate provisioned)..." 
-kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null - -# ----------------------------------------------------------------------------- +# ============================================================================= # Done -# ----------------------------------------------------------------------------- +# ============================================================================= echo "" echo "========================================" -log_success "TLS setup complete" +log_success "TLS setup complete (${TLS_MODE})" echo "========================================" echo "" if [[ "$OSMO_DEPLOYED" == "true" ]]; then echo "OSMO is now accessible at:" - echo " https://${HOSTNAME}" - echo " https://${HOSTNAME}/api/version" + echo " https://${MAIN_HOSTNAME}" + echo " https://${MAIN_HOSTNAME}/api/version" echo "" echo "CLI login:" - echo " osmo login https://${HOSTNAME} --method dev --username admin" + echo " osmo login https://${MAIN_HOSTNAME} --method dev --username admin" else - echo "TLS certificate prepared for: ${HOSTNAME}" + echo "TLS certificate prepared for: ${MAIN_HOSTNAME}" if [[ -n "$AUTH_HOSTNAME" ]]; then echo "Auth TLS certificate prepared for: ${AUTH_HOSTNAME}" fi echo "" echo "Next steps:" - echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}" - echo " 2. Deploy OSMO: ./05-deploy-osmo-control-plane.sh" + if [[ "$TLS_MODE" == "cert-manager" ]]; then + echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}" + else + echo " 1. Certificates stored in: ${CERT_DIR}" + echo " Renewal: re-run this script before the 90-day expiry" + fi + echo " 2. Deploy OSMO: ./04-deploy-osmo-control-plane.sh" echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" if [[ -n "$AUTH_HOSTNAME" ]]; then - echo " 3. Deploy with Keycloak: DEPLOY_KEYCLOAK=true ./05-deploy-osmo-control-plane.sh" - echo " (Keycloak will be exposed at https://${AUTH_HOSTNAME})" + echo " 3. 
Keycloak will be exposed at https://${AUTH_HOSTNAME}" fi fi echo "" diff --git a/applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh similarity index 100% rename from applications/osmo/deploy/002a-setup/05-deploy-osmo-backend.sh rename to applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh b/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh deleted file mode 100755 index 2b2f28cd1..000000000 --- a/applications/osmo/deploy/002-setup/05-deploy-osmo-control-plane.sh +++ /dev/null @@ -1,2051 +0,0 @@ -#!/bin/bash -# -# Deploy OSMO Service (Control Plane) -# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html -# -# Components: API Service, Router, Web UI, Worker, Logger, Agent, Keycloak -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " OSMO Service Deployment" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 -check_helm || exit 1 - -# ----------------------------------------------------------------------------- -# Configuration -# ----------------------------------------------------------------------------- -OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" -# Deploy Keycloak in same namespace as PostgreSQL to simplify DNS resolution -KEYCLOAK_NAMESPACE="${OSMO_NAMESPACE}" -OSMO_DOMAIN="${OSMO_DOMAIN:-osmo.local}" - -# Keycloak admin password - check for existing secret first to maintain consistency -if [[ -z "${KEYCLOAK_ADMIN_PASSWORD:-}" ]]; then - # Try to get existing password from secret - EXISTING_KC_PASS=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null || 
true) - if [[ -n "${EXISTING_KC_PASS}" ]]; then - KEYCLOAK_ADMIN_PASSWORD="${EXISTING_KC_PASS}" - log_info "Using existing Keycloak admin password from secret" - else - KEYCLOAK_ADMIN_PASSWORD="$(openssl rand -base64 12)" - log_info "Generated new Keycloak admin password" - fi -fi - -# ----------------------------------------------------------------------------- -# Get Database Configuration from Terraform (Nebius Managed PostgreSQL) -# ----------------------------------------------------------------------------- -log_info "Using Nebius Managed PostgreSQL..." - log_info "Retrieving database configuration..." - - # Get connection details from Terraform outputs - POSTGRES_HOST=$(get_tf_output "postgresql.host" "../001-iac" || echo "") - POSTGRES_PORT=$(get_tf_output "postgresql.port" "../001-iac" || echo "5432") - POSTGRES_DB=$(get_tf_output "postgresql.database" "../001-iac" || echo "osmo") - POSTGRES_USER=$(get_tf_output "postgresql.username" "../001-iac" || echo "osmo_admin") - - # Get password - try MysteryBox first, then Terraform output, then env vars - # MysteryBox secret ID is set by secrets-init.sh as TF_VAR_postgresql_mysterybox_secret_id - POSTGRES_SECRET_ID="${TF_VAR_postgresql_mysterybox_secret_id:-${OSMO_POSTGRESQL_SECRET_ID:-}}" - - if [[ -n "$POSTGRES_SECRET_ID" ]]; then - log_info "Reading PostgreSQL password from MysteryBox (secret: $POSTGRES_SECRET_ID)..." 
- POSTGRES_PASSWORD=$(get_mysterybox_secret "$POSTGRES_SECRET_ID" "password" || echo "") - if [[ -n "$POSTGRES_PASSWORD" ]]; then - log_success "PostgreSQL password retrieved from MysteryBox" - else - log_warning "Failed to read password from MysteryBox" - fi - fi - - # Fall back to Terraform output (only works if not using MysteryBox) - if [[ -z "$POSTGRES_PASSWORD" ]]; then - POSTGRES_PASSWORD=$(get_tf_output "postgresql_password" "../001-iac" || echo "") - fi - - # Fall back to environment variables or prompt - if [[ -z "$POSTGRES_HOST" || -z "$POSTGRES_PASSWORD" ]]; then - log_warning "Could not retrieve PostgreSQL configuration automatically" - log_info "Checking environment variables..." - - POSTGRES_HOST=${POSTGRES_HOST:-${OSMO_POSTGRES_HOST:-""}} - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-${OSMO_POSTGRES_PASSWORD:-""}} - - if [[ -z "$POSTGRES_HOST" ]]; then - read_prompt_var "PostgreSQL Host" POSTGRES_HOST "" - fi - if [[ -z "$POSTGRES_PASSWORD" ]]; then - read_secret_var "PostgreSQL Password" POSTGRES_PASSWORD - fi - fi - -log_success "Database: ${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" - -# ----------------------------------------------------------------------------- -# Get Storage Configuration -# ----------------------------------------------------------------------------- -log_info "Retrieving storage configuration..." - -S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" || echo "") -S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" || echo "https://storage.eu-north1.nebius.cloud") -S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" || echo "") - -# Secret access key is stored in MysteryBox (ephemeral, not in Terraform state) -S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" || echo "") -S3_SECRET_KEY="" - -if [[ -n "$S3_SECRET_REF_ID" ]]; then - log_info "Retrieving storage secret from MysteryBox..." 
- # IAM access key secrets are stored with key "secret" in MysteryBox - S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" || echo "") - if [[ -n "$S3_SECRET_KEY" ]]; then - log_success "Storage secret retrieved from MysteryBox" - else - log_warning "Could not retrieve storage secret from MysteryBox" - fi -fi - -if [[ -n "$S3_BUCKET" ]]; then - log_success "Storage: ${S3_BUCKET} @ ${S3_ENDPOINT}" -fi - -# ----------------------------------------------------------------------------- -# Add Helm Repositories -# ----------------------------------------------------------------------------- -log_info "Adding Helm repositories..." -helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update -helm repo add bitnami https://charts.bitnami.com/bitnami --force-update -helm repo update - -# ----------------------------------------------------------------------------- -# Step 1: Create Namespaces -# ----------------------------------------------------------------------------- -log_info "Creating namespace..." -kubectl create namespace "${OSMO_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - -# Note: Keycloak is deployed in the same namespace as OSMO (no separate namespace needed) - -# ----------------------------------------------------------------------------- -# Step 2: Configure PostgreSQL - Verify Connection and Create Databases -# ----------------------------------------------------------------------------- -log_info "Verifying PostgreSQL connection..." 
- - # Delete any existing test/init pods - kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - - # Create a temporary secret with DB credentials - # NOTE: PGDATABASE must be the bootstrap database ('osmo') for Nebius MSP PostgreSQL - kubectl create secret generic osmo-db-init-creds \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=PGPASSWORD="${POSTGRES_PASSWORD}" \ - --from-literal=PGHOST="${POSTGRES_HOST}" \ - --from-literal=PGPORT="${POSTGRES_PORT}" \ - --from-literal=PGUSER="${POSTGRES_USER}" \ - --from-literal=PGDATABASE="${POSTGRES_DB}" \ - --dry-run=client -o yaml | kubectl apply -f - - - # ----------------------------------------------------------------------------- - # Connection Test - Verify credentials before proceeding - # ----------------------------------------------------------------------------- - log_info "Testing PostgreSQL connection (this may take a moment)..." 
- - kubectl run osmo-db-test \ - --namespace "${OSMO_NAMESPACE}" \ - --image=postgres:16-alpine \ - --restart=Never \ - --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ - --env="PGHOST=${POSTGRES_HOST}" \ - --env="PGPORT=${POSTGRES_PORT}" \ - --env="PGUSER=${POSTGRES_USER}" \ - --env="PGDATABASE=${POSTGRES_DB}" \ - --command -- sh -c 'psql -c "SELECT 1" >/dev/null 2>&1 && echo "CONNECTION_OK" || echo "CONNECTION_FAILED"' \ - >/dev/null 2>&1 - - # Wait for test pod to complete - test_elapsed=0 - test_status="" - while [[ $test_elapsed -lt 60 ]]; do - test_status=$(kubectl get pod osmo-db-test -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") - if [[ "$test_status" == "Succeeded" || "$test_status" == "Failed" ]]; then - break - fi - sleep 2 - ((test_elapsed += 2)) - done - - # Check test result - test_result=$(kubectl logs osmo-db-test -n "${OSMO_NAMESPACE}" 2>/dev/null || echo "") - kubectl delete pod osmo-db-test -n "${OSMO_NAMESPACE}" --ignore-not-found >/dev/null 2>&1 - - if [[ "$test_result" != *"CONNECTION_OK"* ]]; then - log_error "PostgreSQL connection test failed!" - echo "" - echo "Connection details:" - echo " Host: ${POSTGRES_HOST}" - echo " Port: ${POSTGRES_PORT}" - echo " Database: ${POSTGRES_DB}" - echo " User: ${POSTGRES_USER}" - echo " Password: (from MysteryBox secret ${TF_VAR_postgresql_mysterybox_secret_id:-'not set'})" - echo "" - echo "Possible causes:" - echo " 1. Password mismatch - MysteryBox password doesn't match PostgreSQL" - echo " Fix: Update MysteryBox or recreate PostgreSQL cluster" - echo " 2. Network issue - Cluster cannot reach PostgreSQL" - echo " 3. 
PostgreSQL not ready - Wait and retry" - echo "" - echo "To debug manually:" - echo " kubectl run psql-debug --rm -it --image=postgres:16-alpine -n osmo -- sh" - echo " PGPASSWORD='' psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB}" - exit 1 - fi - - log_success "PostgreSQL connection verified" - - # ----------------------------------------------------------------------------- - # Database Creation - # ----------------------------------------------------------------------------- - if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - log_info "Creating OSMO and Keycloak databases (if not exist)..." - else - log_info "Verifying OSMO database..." - fi - - # NOTE: Nebius MSP PostgreSQL creates the bootstrap database ('osmo') automatically. - # The bootstrap user can only connect to this database, not 'postgres'. - # We connect to 'osmo' and create additional databases from there. - # Pass DEPLOY_KEYCLOAK to the init pod - kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null 2>&1; then - echo "ERROR: Cannot connect to PostgreSQL" - echo "Debug: PGHOST=\$PGHOST, PGPORT=\$PGPORT, PGUSER=\$PGUSER, PGDATABASE=\${PGDATABASE:-osmo}" - # Try with verbose error - psql -d "\${PGDATABASE:-osmo}" -c "SELECT 1" 2>&1 || true - exit 1 - fi - echo "Connection successful to database '\${PGDATABASE:-osmo}'" - - # The 'osmo' database already exists (created by Nebius bootstrap) - echo "Database 'osmo' exists (created by Nebius MSP bootstrap)" - - # Create keycloak database only if Keycloak deployment is enabled - DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" - if [ "\$DEPLOY_KEYCLOAK" = "true" ]; then - # Note: This requires the user to have CREATEDB privilege - if psql -d "\${PGDATABASE:-osmo}" -tAc "SELECT 1 FROM pg_database WHERE datname='keycloak'" | grep -q 1; then - echo "Database 'keycloak' already exists" - else - echo "Creating database 'keycloak'..." 
- psql -d "\${PGDATABASE:-osmo}" -c "CREATE DATABASE keycloak;" || { - echo "WARNING: Could not create 'keycloak' database." - echo "The bootstrap user may not have CREATEDB privilege." - echo "Keycloak will use a schema in the 'osmo' database instead." - } - fi - fi - - # Verify databases exist - echo "" - echo "Verifying databases..." - psql -d "\${PGDATABASE:-osmo}" -c "\l" | grep -E "osmo" || true - - echo "" - echo "SUCCESS: Database initialization complete" - restartPolicy: Never -EOF - - # Wait for pod to complete (init pods may finish before Ready condition is detected) - log_info "Running database initialization..." - - # Poll for completion - init pods go directly to Completed/Succeeded very quickly - max_wait=120 - elapsed=0 - pod_status="" - - while [[ $elapsed -lt $max_wait ]]; do - pod_status=$(kubectl get pod osmo-db-init -n "${OSMO_NAMESPACE}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") - - if [[ "$pod_status" == "Succeeded" ]]; then - break - elif [[ "$pod_status" == "Failed" ]]; then - log_error "Database initialization failed. Checking logs..." - kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" - kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found - exit 1 - fi - - sleep 2 - ((elapsed += 2)) - done - - if [[ "$pod_status" != "Succeeded" ]]; then - log_error "Database initialization timed out (status: $pod_status). Checking logs..." 
- kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" 2>/dev/null || true - kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found - exit 1 - fi - - # Show logs for verification - log_info "Database initialization output:" - kubectl logs osmo-db-init -n "${OSMO_NAMESPACE}" - - # Cleanup - kubectl delete pod osmo-db-init -n "${OSMO_NAMESPACE}" --ignore-not-found - -log_success "Databases verified and ready" - -# ----------------------------------------------------------------------------- -# Step 3: Create Secrets -# ----------------------------------------------------------------------------- -log_info "Creating secrets..." - -# Database secret for Keycloak (only if Keycloak is being deployed) -if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - kubectl create secret generic keycloak-db-secret \ - --namespace "${KEYCLOAK_NAMESPACE}" \ - --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - -fi - -# Create the postgres-secret that OSMO chart expects -# The chart looks for passwordSecretName: postgres-secret, passwordSecretKey: password -kubectl create secret generic postgres-secret \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=password="${POSTGRES_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - - -# OIDC secrets (only needed if Keycloak is deployed) -# These are placeholder values that get overwritten with real Keycloak client secrets -if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - HMAC_SECRET=$(openssl rand -base64 32) - CLIENT_SECRET=$(openssl rand -base64 32) - kubectl create secret generic oidc-secrets \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=client_secret="${CLIENT_SECRET}" \ - --from-literal=hmac_secret="${HMAC_SECRET}" \ - --dry-run=client -o yaml | kubectl apply -f - -fi - -# Storage secret (if available) -if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then - kubectl create secret generic osmo-storage \ - --namespace "${OSMO_NAMESPACE}" \ 
- --from-literal=access-key-id="${S3_ACCESS_KEY}" \ - --from-literal=secret-access-key="${S3_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - -fi - -# MEK (Master Encryption Key) Configuration -# OSMO expects MEK in JWK (JSON Web Key) format, base64-encoded -# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html -MEK_ID="${MEK_ID:-key1}" -log_info "Configuring MEK (Master Encryption Key)..." - -# Try to read MEK from MysteryBox first (set by secrets-init.sh) -# MysteryBox secret ID is set as TF_VAR_mek_mysterybox_secret_id -MEK_SECRET_ID="${TF_VAR_mek_mysterybox_secret_id:-${OSMO_MEK_SECRET_ID:-}}" -MEK_DATA="" - -if [[ -n "$MEK_SECRET_ID" ]]; then - log_info "Reading MEK from MysteryBox (secret: $MEK_SECRET_ID)..." - MEK_DATA=$(get_mysterybox_secret "$MEK_SECRET_ID" "mek" || echo "") - if [[ -n "$MEK_DATA" ]]; then - log_success "MEK retrieved from MysteryBox" - # MEK from secrets-init.sh is in format: {"currentMek":"key1","meks":{"key1":""}} - # Extract the key ID and encoded value - MEK_ID=$(echo "$MEK_DATA" | jq -r '.currentMek // "key1"' 2>/dev/null || echo "key1") - MEK_ENCODED=$(echo "$MEK_DATA" | jq -r ".meks.${MEK_ID} // empty" 2>/dev/null || echo "") - - if [[ -z "$MEK_ENCODED" ]]; then - log_warning "Could not parse MEK from MysteryBox, will generate new one" - MEK_DATA="" - fi - else - log_warning "Failed to read MEK from MysteryBox" - fi -fi - -# Generate new MEK if not retrieved from MysteryBox -if [[ -z "$MEK_DATA" || -z "$MEK_ENCODED" ]]; then - log_info "Generating new MEK in JWK format..." 
- MEK_KEY_RAW="$(openssl rand -base64 32 | tr -d '\n')" - MEK_JWK="{\"k\":\"${MEK_KEY_RAW}\",\"kid\":\"${MEK_ID}\",\"kty\":\"oct\"}" - MEK_ENCODED="$(echo -n "$MEK_JWK" | base64 | tr -d '\n')" - log_success "New MEK generated" -fi - -# Create MEK ConfigMap (OSMO expects ConfigMap, not Secret) -kubectl apply -n "${OSMO_NAMESPACE}" -f - </dev/null || true" \ - 2>/dev/null || true - kubectl wait --for=condition=Ready pod/osmo-mek-key-reset -n "${OSMO_NAMESPACE}" --timeout=30s 2>/dev/null || true - sleep 5 - kubectl delete pod osmo-mek-key-reset -n "${OSMO_NAMESPACE}" --force 2>/dev/null || true -fi - -# ----------------------------------------------------------------------------- -# Step 3.5: Deploy Redis (Required for OSMO rate limiting) -# ----------------------------------------------------------------------------- -log_info "Deploying Redis..." - -if kubectl get statefulset redis-master -n "${OSMO_NAMESPACE}" &>/dev/null; then - log_info "Redis already deployed" -else - helm upgrade --install redis bitnami/redis \ - --namespace "${OSMO_NAMESPACE}" \ - --set architecture=standalone \ - --set auth.enabled=false \ - --set master.persistence.size=1Gi \ - --set master.resources.requests.cpu=100m \ - --set master.resources.requests.memory=128Mi \ - --wait --timeout 5m - - log_success "Redis deployed" -fi - -REDIS_HOST="redis-master.${OSMO_NAMESPACE}.svc.cluster.local" - -# ----------------------------------------------------------------------------- -# Step 4: Deploy Keycloak (Enable with DEPLOY_KEYCLOAK=true) -# ----------------------------------------------------------------------------- -# Keycloak provides authentication for OSMO -# Required for: osmo login, osmo token, backend operator -# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak - -# Auto-detect TLS certificate early (needed for KC_EXTERNAL decision below) -TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" -TLS_ENABLED="false" -if 
kubectl get secret "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then - log_info "TLS certificate detected (${TLS_SECRET})" - TLS_ENABLED="true" -elif kubectl get certificate "${TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then - log_info "TLS certificate pending (${TLS_SECRET})" - TLS_ENABLED="true" -fi - -# Keycloak service URL (same namespace as OSMO) -KEYCLOAK_HOST="keycloak.${OSMO_NAMESPACE}.svc.cluster.local" -KEYCLOAK_URL="http://${KEYCLOAK_HOST}:80" - -# Derive Keycloak external hostname -# Priority: KEYCLOAK_HOSTNAME env var > auto-derive from OSMO_INGRESS_HOSTNAME > OSMO_DOMAIN -if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then - AUTH_DOMAIN="${KEYCLOAK_HOSTNAME}" -elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then - AUTH_DOMAIN="auth.${OSMO_INGRESS_HOSTNAME}" -else - AUTH_DOMAIN="auth.${OSMO_DOMAIN}" -fi -KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" - -if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - log_info "Deploying Keycloak for OSMO authentication..." - log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" - - # Keycloak database was already created in Step 2 (osmo-db-init pod) when DEPLOY_KEYCLOAK=true - - # ------------------------------------------------------------------------- - # Step 4a: Create secrets for Keycloak - # ------------------------------------------------------------------------- - log_info "Creating Keycloak secrets..." 
- - # Save admin password to secret for future re-runs - kubectl create secret generic keycloak-admin-secret \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=password="${KEYCLOAK_ADMIN_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Create keycloak-db-secret for external database (per OSMO docs) - kubectl create secret generic keycloak-db-secret \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=postgres-password="${POSTGRES_PASSWORD}" \ - --dry-run=client -o yaml | kubectl apply -f - - - log_success "Keycloak secrets created" - - # ------------------------------------------------------------------------- - # Step 4b: Determine if Keycloak should use external TLS ingress - # ------------------------------------------------------------------------- - KC_EXTERNAL="false" - if [[ "$TLS_ENABLED" == "true" && -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then - # Check TLS secret for auth domain exists - if kubectl get secret "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null || \ - kubectl get secret "${KC_TLS_SECRET}" -n "${INGRESS_NAMESPACE:-ingress-nginx}" &>/dev/null; then - KC_EXTERNAL="true" - log_info "Keycloak will be exposed externally at: https://${AUTH_DOMAIN}" - elif kubectl get certificate "${KC_TLS_SECRET}" -n "${OSMO_NAMESPACE}" &>/dev/null; then - KC_EXTERNAL="true" - log_info "Keycloak TLS certificate pending — will create external ingress" - else - log_warning "TLS secret '${KC_TLS_SECRET}' for Keycloak not found." - log_warning "Run: DEPLOY_KEYCLOAK=true ./04-enable-tls.sh ${OSMO_INGRESS_HOSTNAME}" - log_warning "Keycloak will be internal-only (port-forward access)" - fi - fi - - # ------------------------------------------------------------------------- - # Step 4c: Install Keycloak using Bitnami Helm chart - # ------------------------------------------------------------------------- - log_info "Installing Keycloak using Bitnami Helm chart..." 
- - # Add Bitnami repo - helm repo add bitnami https://charts.bitnami.com/bitnami --force-update 2>/dev/null || true - helm repo update bitnami - - # Create keycloak-values.yaml per OSMO documentation - cat > /tmp/keycloak-values.yaml </dev/null || true - - # Install or upgrade Keycloak - helm upgrade --install keycloak bitnami/keycloak \ - --namespace "${OSMO_NAMESPACE}" \ - -f /tmp/keycloak-values.yaml \ - --timeout 10m || { - log_warning "Helm install returned non-zero, checking pod status..." - } - - rm -f /tmp/keycloak-values.yaml - log_success "Keycloak Helm release installed" - - # Wait for Keycloak to be ready - log_info "Waiting for Keycloak to be ready (this may take 3-5 minutes)..." - - # Wait for the pod to exist first - for i in {1..30}; do - if kubectl get pods -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak 2>/dev/null | grep -q keycloak; then - break - fi - echo " Waiting for Keycloak pod to be created... ($i/30)" - sleep 5 - done - - # Now wait for it to be ready - kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ - -n "${OSMO_NAMESPACE}" --timeout=300s || { - log_warning "Keycloak pod not ready yet, checking logs..." - kubectl logs -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --tail=30 || true - } - - # Additional wait for Keycloak to fully initialize - log_info "Waiting for Keycloak to fully initialize..." - sleep 30 - - # ------------------------------------------------------------------------- - # Step 4c.1: Verify admin password works (handle stale DB) - # ------------------------------------------------------------------------- - # KC_BOOTSTRAP_ADMIN_* only creates the admin user on FIRST database init. - # If the keycloak DB already existed (e.g. from a prior deployment with a - # different password), the bootstrap is a no-op and the stored password - # won't match. We detect this and reset the password via SQL. - log_info "Verifying Keycloak admin password..." 
- - KC_POD=$(kubectl get pods -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak \ - -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") - - if [[ -n "$KC_POD" ]]; then - KC_TOKEN_RESP=$(kubectl exec -n "${OSMO_NAMESPACE}" "${KC_POD}" -- \ - curl -s -X POST http://localhost:8080/realms/master/protocol/openid-connect/token \ - -d "client_id=admin-cli" \ - -d "username=admin" \ - -d "password=${KEYCLOAK_ADMIN_PASSWORD}" \ - -d "grant_type=password" 2>/dev/null || echo "") - - if echo "$KC_TOKEN_RESP" | grep -q "access_token"; then - log_success "Keycloak admin password verified" - else - log_warning "Admin password mismatch (stale keycloak DB). Resetting via SQL..." - # Use the db-init credentials to reset the admin password in the keycloak DB - # Keycloak 26.x stores bcrypt hashes. We use the Keycloak KC_SPI approach instead: - # Drop and recreate the keycloak database, then restart Keycloak so bootstrap runs fresh. - kubectl delete pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - kubectl run osmo-kc-db-reset \ - --namespace "${OSMO_NAMESPACE}" \ - --image=postgres:16-alpine \ - --restart=Never \ - --env="PGPASSWORD=${POSTGRES_PASSWORD}" \ - --env="PGHOST=${POSTGRES_HOST}" \ - --env="PGPORT=${POSTGRES_PORT}" \ - --env="PGUSER=${POSTGRES_USER}" \ - --env="PGDATABASE=${POSTGRES_DB}" \ - --command -- sh -c ' - echo "Dropping keycloak database..." - psql -c "DROP DATABASE IF EXISTS keycloak;" - echo "Recreating keycloak database..." 
- psql -c "CREATE DATABASE keycloak;" - echo "Done" - ' >/dev/null 2>&1 - - # Wait for reset pod - for i in $(seq 1 30); do - _rst_status=$(kubectl get pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.status.phase}' 2>/dev/null || echo "Pending") - [[ "$_rst_status" == "Succeeded" || "$_rst_status" == "Failed" ]] && break - sleep 2 - done - kubectl logs osmo-kc-db-reset -n "${OSMO_NAMESPACE}" 2>/dev/null || true - kubectl delete pod osmo-kc-db-reset -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null - - if [[ "$_rst_status" == "Succeeded" ]]; then - log_info "Keycloak DB reset. Restarting Keycloak pod for fresh bootstrap..." - kubectl delete pod -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=keycloak --wait=false - sleep 10 - kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=keycloak \ - -n "${OSMO_NAMESPACE}" --timeout=300s || log_warning "Keycloak pod not ready after restart" - log_info "Waiting for Keycloak to fully initialize after restart..." - sleep 20 - log_success "Keycloak restarted with fresh DB (admin password will match)" - else - log_error "Failed to reset keycloak DB. Admin password may not work." - log_error "Manually reset: psql -h ${POSTGRES_HOST} -U ${POSTGRES_USER} -d ${POSTGRES_DB} -c 'DROP DATABASE keycloak; CREATE DATABASE keycloak;'" - fi - fi - fi - - # ------------------------------------------------------------------------- - # Step 4d: Import OSMO realm using official sample_osmo_realm.json - # ------------------------------------------------------------------------- - log_info "Configuring Keycloak realm using official OSMO realm JSON..." 
- - # Generate client secret for osmo-browser-flow (confidential client) - OIDC_CLIENT_SECRET=$(openssl rand -hex 16) - - # Determine OSMO base URL for client redirect URIs - if [[ "$KC_EXTERNAL" == "true" ]]; then - OSMO_BASE_URL="https://${OSMO_INGRESS_HOSTNAME}" - else - OSMO_BASE_URL="http://localhost:8080" - fi - - # Upload the official realm JSON as a ConfigMap (so the job can mount it) - log_info "Creating ConfigMap from sample_osmo_realm.json..." - kubectl create configmap keycloak-realm-json \ - --namespace "${OSMO_NAMESPACE}" \ - --from-file=realm.json="${SCRIPT_DIR}/sample_osmo_realm.json" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Create a job to import the realm and configure a test user - cat > /tmp/keycloak-config-job.yaml <8080, - # so /health/ready returns 404. Use /realms/master as readiness check instead. - for i in 1 2 3 4 5 6 7 8 9 10 11 12; do - if curl -s -f "\${KEYCLOAK_URL}/realms/master" > /dev/null 2>&1; then - echo "Keycloak is ready" - break - fi - echo " Attempt \$i: Keycloak not ready yet..." - sleep 15 - done - echo "" - - # -- Step 3: Get admin token -- - echo "=== Step 3: Get admin token ===" - for i in 1 2 3 4 5; do - TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ - --data-urlencode "client_id=admin-cli" \ - --data-urlencode "username=admin" \ - --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ - --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) - if [ -n "\$TOKEN" ]; then break; fi - echo " Retry \$i: waiting for token..." 
- sleep 10 - done - - if [ -z "\$TOKEN" ]; then - echo "FATAL: Failed to get admin token" - exit 1 - fi - echo "Got admin token" - echo "" - - # -- Step 4: Import OSMO realm -- - echo "=== Step 4: Import OSMO realm ===" - - # Delete existing realm if present (idempotent re-runs) - REALM_STATUS=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ - -H "Authorization: Bearer \$TOKEN") - if [ "\$REALM_STATUS" = "200" ]; then - echo " Existing 'osmo' realm found - deleting for fresh import..." - curl -s -X DELETE "\${KEYCLOAK_URL}/admin/realms/osmo" \ - -H "Authorization: Bearer \$TOKEN" - echo " Old realm deleted" - sleep 5 - fi - - echo "Importing official OSMO realm from sample_osmo_realm.json..." - IMPORT_HTTP=\$(curl -s -o /tmp/import-resp.txt -w "%{http_code}" \ - -X POST "\${KEYCLOAK_URL}/admin/realms" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d @/tmp/realm-import.json) - - if [ "\$IMPORT_HTTP" = "201" ] || [ "\$IMPORT_HTTP" = "204" ]; then - echo "Realm imported successfully (HTTP \$IMPORT_HTTP)" - else - echo "WARNING: Realm import returned HTTP \$IMPORT_HTTP" - cat /tmp/import-resp.txt 2>/dev/null || true - echo "" - echo "Trying partial import as fallback..." 
- curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/partialImport" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d @/tmp/realm-import.json || echo "Partial import also failed" - fi - - # Verify realm exists - sleep 3 - VERIFY=\$(curl -s -o /dev/null -w "%{http_code}" "\${KEYCLOAK_URL}/admin/realms/osmo" \ - -H "Authorization: Bearer \$TOKEN") - if [ "\$VERIFY" != "200" ]; then - echo "FATAL: Realm 'osmo' not found after import (HTTP \$VERIFY)" - exit 1 - fi - echo "Realm 'osmo' verified" - echo "" - - # -- Step 5: Create test user -- - echo "=== Step 5: Create test user ===" - - # Refresh admin token - TOKEN=\$(curl -s -X POST "\${KEYCLOAK_URL}/realms/master/protocol/openid-connect/token" \ - --data-urlencode "client_id=admin-cli" \ - --data-urlencode "username=admin" \ - --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" \ - --data-urlencode "grant_type=password" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) - - echo "Creating osmo-admin test user..." 
- curl -s -X POST "\${KEYCLOAK_URL}/admin/realms/osmo/users" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{ - "username": "osmo-admin", - "enabled": true, - "emailVerified": true, - "firstName": "OSMO", - "lastName": "Admin", - "email": "osmo-admin@example.com", - "credentials": [{"type":"password","value":"osmo-admin","temporary":false}] - }' || echo "User may already exist" - echo "" - - # -- Step 6: Assign user to Admin group -- - echo "=== Step 6: Assign user to Admin group ===" - - USER_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/users?username=osmo-admin" \ - -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) - - if [ -n "\$USER_ID" ]; then - echo " User ID: \$USER_ID" - - ADMIN_GROUP_ID=\$(curl -s "\${KEYCLOAK_URL}/admin/realms/osmo/groups?search=Admin" \ - -H "Authorization: Bearer \$TOKEN" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4) - - if [ -n "\$ADMIN_GROUP_ID" ]; then - echo " Admin Group ID: \$ADMIN_GROUP_ID" - curl -s -X PUT "\${KEYCLOAK_URL}/admin/realms/osmo/users/\${USER_ID}/groups/\${ADMIN_GROUP_ID}" \ - -H "Authorization: Bearer \$TOKEN" \ - -H "Content-Type: application/json" \ - -d '{}' || echo "Failed to assign group" - echo " User 'osmo-admin' assigned to Admin group (osmo-admin + osmo-user roles)" - else - echo " WARNING: Admin group not found - user roles may need manual assignment" - fi - else - echo " WARNING: Could not find osmo-admin user ID" - fi - echo "" - - # -- Done -- - echo "=========================================" - echo " Keycloak OSMO Configuration Complete" - echo "=========================================" - echo "" - echo "Realm: osmo (imported from official sample_osmo_realm.json)" - echo "Clients: osmo-device (public, device code + direct access)" - echo " osmo-browser-flow (confidential, authorization code)" - echo "Groups: Admin, User, Backend Operator" - echo "Roles: osmo-admin, osmo-user, osmo-backend, grafana-*, dashboard-*" 
- echo "Mappers: JWT 'roles' claim configured on both clients" - echo "Test user: osmo-admin / osmo-admin (Admin group)" - echo "" -EOF - - # Delete any previous config job - kubectl delete job keycloak-osmo-setup -n "${KEYCLOAK_NAMESPACE}" --ignore-not-found 2>/dev/null || true - - kubectl apply -f /tmp/keycloak-config-job.yaml - - log_info "Waiting for Keycloak realm import job..." - kubectl wait --for=condition=complete job/keycloak-osmo-setup \ - -n "${KEYCLOAK_NAMESPACE}" --timeout=300s || { - log_warning "Keycloak configuration may have failed, check logs:" - kubectl logs -n "${KEYCLOAK_NAMESPACE}" -l job-name=keycloak-osmo-setup --tail=50 || true - } - - # Store the client secret for OIDC (used by Envoy sidecar) - kubectl create secret generic oidc-secrets \ - --namespace "${OSMO_NAMESPACE}" \ - --from-literal=client_secret="${OIDC_CLIENT_SECRET}" \ - --from-literal=hmac_secret="$(openssl rand -base64 32)" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Clean up temporary files and ConfigMap - rm -f /tmp/keycloak-config-job.yaml - kubectl delete configmap keycloak-realm-json -n "${OSMO_NAMESPACE}" --ignore-not-found 2>/dev/null || true - - log_success "Keycloak deployed and configured" - echo "" - if [[ "$KC_EXTERNAL" == "true" ]]; then - echo "Keycloak Access (external):" - echo " URL: https://${AUTH_DOMAIN}" - echo " Admin console: https://${AUTH_DOMAIN}/admin" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo " Test User: osmo-admin / osmo-admin" - echo "" - echo "OSMO Auth Endpoints:" - echo " Token: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/token" - echo " Auth: https://${AUTH_DOMAIN}/realms/osmo/protocol/openid-connect/auth" - echo "" - # Enable OSMO auth with Envoy sidecars (production mode) - AUTH_ENABLED="true" - KEYCLOAK_EXTERNAL_URL="https://${AUTH_DOMAIN}" - log_success "OSMO authentication will be ENABLED with Envoy sidecars" - else - echo "Keycloak Access (port-forward only):" - echo " kubectl port-forward -n 
${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" - echo " URL: http://localhost:8081" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo " Test User: osmo-admin / osmo-admin" - echo "" - echo "OSMO Auth Endpoints (in-cluster):" - echo " Token: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/token" - echo " Auth: ${KEYCLOAK_URL}/realms/osmo/protocol/openid-connect/auth" - echo "" - # Auth disabled when Keycloak is internal-only - AUTH_ENABLED="false" - KEYCLOAK_EXTERNAL_URL="" - log_info "Note: OSMO auth disabled (Keycloak is internal-only, no TLS ingress)" - log_info "To enable auth, run: DEPLOY_KEYCLOAK=true ./04-enable-tls.sh ${OSMO_INGRESS_HOSTNAME:-}" - fi -else - log_info "Skipping Keycloak (set DEPLOY_KEYCLOAK=true to enable)" - log_warning "Without Keycloak, 'osmo login' and token creation will not work" - log_info "Reference: https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/deploy_service.html#step-2-configure-keycloak" - AUTH_ENABLED="false" - KEYCLOAK_EXTERNAL_URL="" -fi - -# ----------------------------------------------------------------------------- -# Step 5: Create OSMO Values File -# ----------------------------------------------------------------------------- -log_info "Creating OSMO values file..." - -# NGINX Ingress – run 03-deploy-nginx-ingress.sh before this script -# When OSMO_INGRESS_HOSTNAME is empty (default), ingress matches any Host header, -# allowing direct IP-based access. Set it to a real domain for host-based routing. 
-INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" -if [[ -n "$INGRESS_HOSTNAME" ]]; then - log_info "Ingress hostname: ${INGRESS_HOSTNAME}" -else - log_info "Ingress hostname: (any — IP-based access)" -fi - -# TLS_SECRET and TLS_ENABLED were already set earlier (before Keycloak section) -if [[ "$TLS_ENABLED" == "true" ]]; then - log_success "TLS certificate detected (${TLS_SECRET}) — will create HTTPS Ingress" -fi - -# Create the values file with proper extraEnv and extraVolumes for each service -# This configures PostgreSQL password via env var and MEK via volume mount -cat > /tmp/osmo_values.yaml < Keycloak) - oauth2Filter: - enabled: true - tokenEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/token - authEndpoint: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/auth - clientId: osmo-browser-flow - authProvider: ${AUTH_DOMAIN} - secretName: oidc-secrets - clientSecretKey: client_secret - hmacSecretKey: hmac_secret - - # JWT Filter config -- three providers - jwt: - user_header: x-osmo-user - providers: - # Provider 1: Keycloak device flow (CLI) - - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo - audience: osmo-device - jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs - user_claim: preferred_username - cluster: oauth - # Provider 2: Keycloak browser flow (Web UI) - - issuer: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo - audience: osmo-browser-flow - jwks_uri: ${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect/certs - user_claim: preferred_username - cluster: oauth - # Provider 3: OSMO-signed JWTs (service accounts) - - issuer: osmo - audience: osmo - jwks_uri: http://localhost:8000/api/auth/keys - user_claim: unique_name - cluster: service -ENVOY_ENABLED -else -cat </dev/null || true - -log_info "Deploying OSMO Service..." 
- -SERVICE_HELM_ARGS=( - --namespace "${OSMO_NAMESPACE}" - -f /tmp/osmo_values.yaml -) -[[ -n "$INGRESS_HOSTNAME" ]] && SERVICE_HELM_ARGS+=(--set "services.service.hostname=${INGRESS_HOSTNAME}") - -helm upgrade --install osmo-service osmo/service \ - "${SERVICE_HELM_ARGS[@]}" \ - --wait --timeout 10m || { - log_warning "OSMO Service deployment had issues" - log_info "Checking pod status..." - kubectl get pods -n "${OSMO_NAMESPACE}" --no-headers | head -10 -} - -log_success "OSMO Service deployed" - -# Create an internal service for osmo-agent that bypasses the Envoy sidecar. -# Backend operators (06-deploy-osmo-backend.sh) connect via this service so they -# can authenticate with OSMO service tokens without needing Envoy/Keycloak JWTs. -log_info "Creating osmo-agent-internal service (bypasses Envoy)..." -kubectl apply -n "${OSMO_NAMESPACE}" -f - < /tmp/osmo_router_values.yaml < /tmp/osmo_ui_values.yaml </dev/null); do - ing_name="${ing#*/}" - [[ "$ing_name" == "osmo-tls-bootstrap" ]] && continue # skip bootstrap ingress - CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NAMESPACE}" -o jsonpath='{.spec.rules[0].http}') - - kubectl patch "$ing" -n "${OSMO_NAMESPACE}" --type=merge -p "$(cat </dev/null - log_success "Ingress TLS patching complete" -fi - -# ----------------------------------------------------------------------------- -# Step 9: Patch Deployments to Add vault-secrets Volume -# ----------------------------------------------------------------------------- -# NOTE: The Helm chart's extraVolumes/extraVolumeMounts values don't work reliably. -# We must patch the deployments after Helm creates them to add the vault-secrets volume. -# This is a known workaround - the env vars work via extraEnv, but volumes don't. - -log_info "Patching OSMO deployments to add vault-secrets volume mount..." 
- -# Create the JSON patch file -cat > /tmp/vault-patch.json << 'PATCH_EOF' -[ - {"op": "add", "path": "/spec/template/spec/volumes/-", "value": {"name": "vault-secrets", "secret": {"secretName": "vault-secrets"}}}, - {"op": "add", "path": "/spec/template/spec/containers/0/volumeMounts/-", "value": {"name": "vault-secrets", "mountPath": "/home/osmo/vault-agent/secrets", "readOnly": true}} -] -PATCH_EOF - -# All OSMO deployments that need the vault-secrets volume for MEK -OSMO_DEPLOYMENTS="osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router" - -for deploy in $OSMO_DEPLOYMENTS; do - if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then - # Check if vault-secrets volume already exists - EXISTING_VOL=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || true) - - if [[ -z "$EXISTING_VOL" ]]; then - log_info " Patching $deploy to add vault-secrets volume..." - if kubectl patch deployment/$deploy -n "${OSMO_NAMESPACE}" --type=json --patch-file=/tmp/vault-patch.json; then - log_success " $deploy patched successfully" - else - log_warning " Failed to patch $deploy" - fi - else - log_info " $deploy already has vault-secrets volume, skipping" - fi - else - log_info " $deploy not found, skipping" - fi -done - -# Cleanup patch file -rm -f /tmp/vault-patch.json - -# Wait for rollouts to complete -log_info "Waiting for deployments to roll out with new configuration..." 
-for deploy in $OSMO_DEPLOYMENTS; do - if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then - kubectl rollout status deployment/$deploy -n "${OSMO_NAMESPACE}" --timeout=180s || \ - log_warning " Timeout waiting for $deploy rollout" - fi -done - -log_success "All OSMO deployments patched with vault-secrets volume" - -# ----------------------------------------------------------------------------- -# Step 10: Patch Services for Direct Access (without Envoy) -# ----------------------------------------------------------------------------- -# When Envoy sidecar is disabled, services need to target port 8000 directly -# instead of the 'envoy-http' named port which doesn't exist. -# When Envoy IS enabled, the 'envoy-http' targetPort is correct -- skip patching. - -if [[ "$AUTH_ENABLED" == "true" ]]; then - log_info "Envoy sidecar is ENABLED -- skipping targetPort patches (envoy-http is correct)" -else - log_info "Verifying service ports (Envoy disabled)..." - - OSMO_SERVICES="osmo-service osmo-router osmo-logger osmo-agent" - - for svc in $OSMO_SERVICES; do - if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then - CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") - - if [[ "$CURRENT_TARGET" == "envoy-http" || "$CURRENT_TARGET" == "envoy" ]]; then - log_info " Patching $svc: targetPort envoy-http -> 8000" - kubectl patch svc "$svc" -n "${OSMO_NAMESPACE}" --type='json' \ - -p='[{"op": "replace", "path": "/spec/ports/0/targetPort", "value": 8000}]' || \ - log_warning " Failed to patch $svc" - else - log_info " $svc: targetPort = $CURRENT_TARGET (OK)" - fi - fi - done - - log_success "Service ports verified" -fi - -# ----------------------------------------------------------------------------- -# Step 11: Verify Deployment -# ----------------------------------------------------------------------------- -echo "" -log_info "Verifying deployment configuration..." 
- -# Verify vault-secrets volumes are mounted -echo "" -echo "Volume configuration verification:" -for deploy in $OSMO_DEPLOYMENTS; do - if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then - VOL_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || echo "") - ENV_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ - -o jsonpath='{.spec.template.spec.containers[0].env[*].name}' 2>/dev/null | grep -w "OSMO_POSTGRES_PASSWORD" || echo "") - - VOL_STATUS="✗" - ENV_STATUS="✗" - [[ -n "$VOL_CHECK" ]] && VOL_STATUS="✓" - [[ -n "$ENV_CHECK" ]] && ENV_STATUS="✓" - - echo " $deploy: vault-secrets=$VOL_STATUS, postgres_env=$ENV_STATUS" - fi -done - -echo "" -echo "Pods:" -kubectl get pods -n "${OSMO_NAMESPACE}" - -echo "" -echo "Services:" -kubectl get svc -n "${OSMO_NAMESPACE}" - -# ----------------------------------------------------------------------------- -# Step 12: Configure service_base_url (required for workflow execution) -# ----------------------------------------------------------------------------- -# The osmo-ctrl sidecar in every workflow pod needs service_base_url to -# stream logs, report task status, and refresh tokens. -# This is an application-level config that must be set via the OSMO API. - -echo "" -log_info "Configuring service_base_url for workflow execution..." 
- -# Detect target URL from Ingress -INGRESS_URL=$(detect_service_url 2>/dev/null || true) - -if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then - TARGET_SERVICE_URL="${OSMO_INGRESS_BASE_URL}" - log_info "Using explicit Ingress base URL: ${TARGET_SERVICE_URL}" -elif [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then - TARGET_SERVICE_URL="https://${INGRESS_HOSTNAME}" - log_info "TLS detected, using HTTPS: ${TARGET_SERVICE_URL}" -elif [[ -n "$INGRESS_URL" ]]; then - TARGET_SERVICE_URL="${INGRESS_URL}" - log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" -else - log_warning "Could not detect Ingress URL. Skipping service_base_url configuration." - log_warning "Run ./08-configure-service-url.sh manually after verifying the Ingress." - TARGET_SERVICE_URL="" -fi - -if [[ -n "$TARGET_SERVICE_URL" ]]; then - # Start port-forward using the shared helper (auto-detects Envoy) - start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 - _PF_PID=$PORT_FORWARD_PID - export _OSMO_PORT=8080 - - _cleanup_pf() { - if [[ -n "${_PF_PID:-}" ]]; then - kill $_PF_PID 2>/dev/null || true - wait $_PF_PID 2>/dev/null || true - fi - } - - # Wait for port-forward to be ready - _pf_ready=false - for i in $(seq 1 30); do - if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then - _pf_ready=true - break - fi - sleep 1 - done - - if [[ "$_pf_ready" == "true" ]]; then - # Login (no-op when bypassing Envoy -- osmo_curl handles auth headers) - osmo_login 8080 || true - - # Check current value - CURRENT_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - - if [[ "$CURRENT_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then - log_success "service_base_url already configured: ${CURRENT_SVC_URL}" - else - if [[ -n "$CURRENT_SVC_URL" && "$CURRENT_SVC_URL" != "null" ]]; then - log_warning "Updating service_base_url from '${CURRENT_SVC_URL}' to '${TARGET_SERVICE_URL}'" - fi - - 
# Write config using the PATCH API helper - cat > /tmp/service_url_fix.json << SVCEOF -{ - "service_base_url": "${TARGET_SERVICE_URL}" -} -SVCEOF - if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then - # Verify - NEW_SVC_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then - log_success "service_base_url configured: ${NEW_SVC_URL}" - else - log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." - fi - else - log_warning "Failed to set service_base_url. Run ./08-configure-service-url.sh manually." - fi - rm -f /tmp/service_url_fix.json - fi - - # ----------------------------------------------------------------- - # Step 12b: Populate /api/auth/login endpoints (Keycloak discovery) - # ----------------------------------------------------------------- - # When auth.enabled=false, OSMO doesn't populate /api/auth/login from - # Helm values. We PATCH service_auth.login_info so the CLI can discover - # Keycloak endpoints via `osmo login `. - if [[ "$AUTH_ENABLED" == "true" ]]; then - log_info "Populating /api/auth/login with Keycloak endpoints..." - KC_OIDC="${KEYCLOAK_EXTERNAL_URL}/realms/osmo/protocol/openid-connect" - cat > /tmp/login_info_fix.json << LOGINEOF -{ - "service_auth": { - "login_info": { - "device_endpoint": "${KC_OIDC}/auth/device", - "device_client_id": "osmo-device", - "browser_endpoint": "${KC_OIDC}/auth", - "browser_client_id": "osmo-browser-flow", - "token_endpoint": "${KC_OIDC}/token", - "logout_endpoint": "${KC_OIDC}/logout" - } - } -} -LOGINEOF - if osmo_config_update SERVICE /tmp/login_info_fix.json "Populate auth/login endpoints"; then - log_success "/api/auth/login endpoints configured for Keycloak" - else - log_warning "Failed to populate /api/auth/login. CLI login may not auto-discover endpoints." 
- log_warning "Run: osmo login ${TARGET_SERVICE_URL} --method dev" - fi - rm -f /tmp/login_info_fix.json - fi - else - log_warning "Port-forward not ready. Run ./08-configure-service-url.sh manually." - fi - - _cleanup_pf -fi - -echo "" -echo "========================================" -log_success "OSMO Control Plane deployment complete!" -echo "========================================" -echo "" - -if [[ "$AUTH_ENABLED" == "true" ]]; then - # --- Auth-enabled output --- - echo "Authentication: ENABLED (Keycloak + Envoy sidecars)" - echo "" - echo "Keycloak Admin Console:" - echo " URL: https://${AUTH_DOMAIN}/admin" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo "" - echo "OSMO Access:" - if [[ -n "$INGRESS_URL" ]]; then - echo " OSMO API: ${INGRESS_URL}/api/version (unauthenticated -- skipAuthPath)" - echo " OSMO Web UI: ${INGRESS_URL} (redirects to Keycloak login)" - fi - echo "" - echo "Login methods:" - echo " Browser: Visit ${INGRESS_URL:-https://} -- you will be redirected to Keycloak" - echo " CLI: osmo login ${INGRESS_URL:-https://}" - echo " (Opens browser for device authorization flow)" - echo "" - echo "Test user: osmo-admin / osmo-admin" - echo "" - echo "Keycloak realm management (groups, roles, users):" - echo " https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/keycloak_setup.html" - echo "" -else - # --- No-auth output --- - if [[ "$TLS_ENABLED" == "true" && -n "$INGRESS_HOSTNAME" ]]; then - echo "OSMO Access (HTTPS via NGINX Ingress + cert-manager):" - echo " OSMO API: https://${INGRESS_HOSTNAME}/api/version" - echo " OSMO UI: https://${INGRESS_HOSTNAME}" - echo " OSMO CLI: osmo login https://${INGRESS_HOSTNAME} --method dev --username admin" - echo "" - elif [[ -n "$INGRESS_URL" ]]; then - echo "OSMO Access (via NGINX Ingress LoadBalancer):" - echo " OSMO API: ${INGRESS_URL}/api/version" - echo " OSMO UI: ${INGRESS_URL}" - echo " OSMO CLI: osmo login ${INGRESS_URL} --method dev --username admin" - echo "" - else 
- log_warning "Could not detect Ingress LoadBalancer IP." - echo " Check: kubectl get svc -n ${INGRESS_NAMESPACE:-ingress-nginx}" - echo "" - echo " Fallback (port-forward):" - echo " kubectl port-forward -n ${OSMO_NAMESPACE} svc/osmo-service 8080:80" - echo " URL: http://localhost:8080" - echo "" - fi - - echo "NOTE: OSMO API authentication is DISABLED." - echo " The API is accessible without tokens." - echo " Set DEPLOY_KEYCLOAK=true with TLS to enable Keycloak + Envoy auth." - echo "" - echo "Test the API:" - if [[ -n "$INGRESS_URL" ]]; then - echo " curl ${INGRESS_URL}/api/version" - echo " curl ${INGRESS_URL}/api/workflow" - else - echo " curl http://localhost:8080/api/version" - echo " curl http://localhost:8080/api/workflow" - fi - echo "" - if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]]; then - echo "Keycloak Access (internal only, auth not enforced):" - echo " kubectl port-forward -n ${KEYCLOAK_NAMESPACE} svc/keycloak 8081:80" - echo " URL: http://localhost:8081" - echo " Admin: admin / ${KEYCLOAK_ADMIN_PASSWORD}" - echo " Test User: osmo-admin / osmo-admin" - echo "" - fi -fi - -echo "Ingress resources:" -kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true -echo "" -echo "Next step - Deploy Backend Operator:" -echo " ./06-deploy-osmo-backend.sh" -echo "" diff --git a/applications/osmo/deploy/002a-setup/06-configure-storage.sh b/applications/osmo/deploy/002-setup/06-configure-storage.sh similarity index 100% rename from applications/osmo/deploy/002a-setup/06-configure-storage.sh rename to applications/osmo/deploy/002-setup/06-configure-storage.sh diff --git a/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh b/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh deleted file mode 100755 index afc0c49d6..000000000 --- a/applications/osmo/deploy/002-setup/06-deploy-osmo-backend.sh +++ /dev/null @@ -1,329 +0,0 @@ -#!/bin/bash -# -# Deploy OSMO Backend Operator -# 
https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " OSMO Backend Operator Deployment" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 -check_helm || exit 1 - -# ----------------------------------------------------------------------------- -# Configuration -# ----------------------------------------------------------------------------- -OSMO_OPERATOR_NAMESPACE="osmo-operator" -OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" -OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}" -OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-}" -BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" - -# Check for OSMO Service URL (in-cluster URL for the backend operator pods) -# IMPORTANT: Backend operators connect via WebSocket to osmo-agent, NOT osmo-service! -# The osmo-service handles REST API, osmo-agent handles WebSocket connections for backends -if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then - log_info "Auto-detecting in-cluster OSMO Agent URL..." - - # Backend operators MUST connect to osmo-agent for WebSocket connections. - # Prefer osmo-agent-internal (bypasses Envoy sidecar — no JWT needed for internal comms). - # Falls back to osmo-agent if the internal service doesn't exist. 
- OSMO_AGENT_INTERNAL=$(kubectl get svc -n osmo osmo-agent-internal -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") - OSMO_AGENT=$(kubectl get svc -n osmo osmo-agent -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") - - if [[ -n "$OSMO_AGENT_INTERNAL" ]]; then - OSMO_SERVICE_URL="http://osmo-agent-internal.osmo.svc.cluster.local:80" - log_success "In-cluster Agent URL (internal, no Envoy): ${OSMO_SERVICE_URL}" - elif [[ -n "$OSMO_AGENT" ]]; then - OSMO_SERVICE_URL="http://osmo-agent.osmo.svc.cluster.local:80" - log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" - else - echo "" - log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./05-deploy-osmo-control-plane.sh" - log_error "Note: Backend operators require osmo-agent service for WebSocket connections" - exit 1 - fi -fi - -# Check for OSMO Service Token -if [[ -z "${OSMO_SERVICE_TOKEN:-}" ]]; then - # First, ensure namespace exists so we can check for existing secret - kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true - - # Check if token secret already exists in cluster - EXISTING_TOKEN=$(kubectl get secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" -o jsonpath='{.data.token}' 2>/dev/null | base64 -d || echo "") - - if [[ -n "$EXISTING_TOKEN" ]]; then - log_info "Using existing token from secret osmo-operator-token" - OSMO_SERVICE_TOKEN="$EXISTING_TOKEN" - elif command -v osmo &>/dev/null; then - # Check if osmo CLI is already logged in (don't try to login with in-cluster URL) - log_info "Checking if OSMO CLI is already logged in..." 
- - # Try to generate token - this only works if CLI is already logged in - TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" - EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") - - TOKEN_JSON=$(osmo token set "$TOKEN_NAME" \ - --expires-at "$EXPIRY_DATE" \ - --description "Backend Operator Token" \ - --service --roles osmo-backend -t json 2>/dev/null || echo "") - - if [[ -n "$TOKEN_JSON" ]]; then - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_JSON" | jq -r '.token // empty' 2>/dev/null || echo "") - fi - - if [[ -n "$OSMO_SERVICE_TOKEN" ]]; then - log_success "Service token generated: $TOKEN_NAME (expires: $EXPIRY_DATE)" - fi - fi - - # If still no token, automatically create one using port-forward - if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then - log_info "No token found - automatically creating service token..." - - # Check if osmo CLI is available - if ! command -v osmo &>/dev/null; then - log_error "osmo CLI not found. Please install it first." - exit 1 - fi - - # Start port-forward using shared helper (auto-detects Envoy) - start_osmo_port_forward osmo 8080 - export _OSMO_PORT=8080 - - # Cleanup function to kill port-forward on exit - cleanup_port_forward() { - if [[ -n "${PORT_FORWARD_PID:-}" ]]; then - kill $PORT_FORWARD_PID 2>/dev/null || true - wait $PORT_FORWARD_PID 2>/dev/null || true - fi - } - trap cleanup_port_forward EXIT - - # Wait for port-forward to be ready - log_info "Waiting for port-forward to be ready..." - max_wait=30 - elapsed=0 - while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do - sleep 1 - ((elapsed += 1)) - if [[ $elapsed -ge $max_wait ]]; then - log_error "Port-forward failed to start within ${max_wait}s" - exit 1 - fi - done - log_success "Port-forward ready" - - # Detect if Keycloak auth is active - KEYCLOAK_ENABLED="false" - if [[ "${DEPLOY_KEYCLOAK:-false}" == "true" ]] || kubectl get svc keycloak -n osmo &>/dev/null; then - if has_envoy_sidecar osmo "app=osmo-service"; then - KEYCLOAK_ENABLED="true" - fi - fi - - if [[ "$KEYCLOAK_ENABLED" == "true" ]]; then - # Keycloak + Envoy mode: use the PATCH API via pod port-forward - log_info "Keycloak auth detected — creating service token via API..." - - TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" - EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") - - TOKEN_RESPONSE=$(osmo_curl POST \ - "http://localhost:8080/api/auth/access_token/service/${TOKEN_NAME}?expires_at=${EXPIRY_DATE}&roles=osmo-backend") - - # API returns the token as a plain JSON string (e.g. "abc123...") - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r '. // empty' 2>/dev/null || echo "") - - if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then - log_error "Failed to create service token via API" - echo "Response: $TOKEN_RESPONSE" - exit 1 - fi - log_success "Service token created via API: $TOKEN_NAME" - else - # No Keycloak: use osmo CLI with dev login - log_info "Logging in to OSMO (dev method)..." - if ! osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then - log_error "Failed to login to OSMO" - exit 1 - fi - log_success "Logged in successfully" - - # Create service token - TOKEN_NAME="backend-token-$(date -u +%Y%m%d%H%M%S)" - EXPIRY_DATE=$(date -u -d "+1 year" +%F 2>/dev/null || date -u -v+1y +%F 2>/dev/null || echo "2027-01-01") - - log_info "Creating service token: $TOKEN_NAME (expires: $EXPIRY_DATE)..." 
- TOKEN_OUTPUT=$(osmo token set "$TOKEN_NAME" \ - --expires-at "$EXPIRY_DATE" \ - --description "Backend Operator Token (auto-generated)" \ - --service --roles osmo-backend 2>&1) - - # Extract token from output (format: "Access token: ") - OSMO_SERVICE_TOKEN=$(echo "$TOKEN_OUTPUT" | sed -n 's/.*Access token: //p' | tr -d '\r' | xargs) - - if [[ -z "$OSMO_SERVICE_TOKEN" ]]; then - log_error "Failed to create service token" - echo "Output: $TOKEN_OUTPUT" - exit 1 - fi - - log_success "Service token created successfully" - fi - - # Stop port-forward (we're done with it) - cleanup_port_forward - trap - EXIT - fi -fi - -# ----------------------------------------------------------------------------- -# Add OSMO Helm Repository -# ----------------------------------------------------------------------------- -log_info "Adding OSMO Helm repository..." -helm repo add osmo https://helm.ngc.nvidia.com/nvidia/osmo --force-update -helm repo update - -# ----------------------------------------------------------------------------- -# Create Namespaces -# ----------------------------------------------------------------------------- -log_info "Creating namespaces..." -kubectl create namespace "${OSMO_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - -kubectl create namespace "${OSMO_WORKFLOWS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - -# ----------------------------------------------------------------------------- -# Create Secrets -# ----------------------------------------------------------------------------- -log_info "Creating operator token secret..." 
-kubectl create secret generic osmo-operator-token \ - --namespace "${OSMO_OPERATOR_NAMESPACE}" \ - --from-literal=token="${OSMO_SERVICE_TOKEN}" \ - --dry-run=client -o yaml | kubectl apply -f - - -# ----------------------------------------------------------------------------- -# Create Values File -# ----------------------------------------------------------------------------- -log_info "Creating Helm values file..." - -# Note: services.backendListener/Worker are at root level, not under global -# See: osmo-helm-charts/backend-operator/values.yaml -cat > /tmp/backend_operator_values.yaml </dev/null || true - -echo "" -echo "========================================" -log_success "OSMO Backend Operator deployment complete!" -echo "========================================" -echo "" -echo "Backend Name: ${BACKEND_NAME}" -echo "Agent URL (WebSocket): ${OSMO_SERVICE_URL}" -echo "" -# Detect Ingress URL for verification instructions -INGRESS_URL=$(detect_service_url 2>/dev/null || true) - -echo "To verify the backend registration:" -echo "" -if [[ -n "$INGRESS_URL" ]]; then - echo " Check backend status:" - echo " osmo config show BACKEND ${BACKEND_NAME}" - echo "" - echo " Or via curl (using NGINX Ingress LoadBalancer):" - echo " curl ${INGRESS_URL}/api/configs/backend" -else - echo " Terminal 1 - Start port-forward (keep running):" - echo " kubectl port-forward -n osmo svc/osmo-service 8080:80" - echo "" - echo " Terminal 2 - Check backend status:" - echo " osmo config show BACKEND ${BACKEND_NAME}" - echo "" - echo " Or via curl:" - echo " curl http://localhost:8080/api/configs/backend" -fi -echo "" -echo "Next step - Configure Storage:" -echo " ./07-configure-storage.sh" -echo "" diff --git a/applications/osmo/deploy/002a-setup/07-configure-service-url.sh b/applications/osmo/deploy/002-setup/07-configure-service-url.sh similarity index 100% rename from applications/osmo/deploy/002a-setup/07-configure-service-url.sh rename to 
applications/osmo/deploy/002-setup/07-configure-service-url.sh diff --git a/applications/osmo/deploy/002-setup/07-configure-storage.sh b/applications/osmo/deploy/002-setup/07-configure-storage.sh deleted file mode 100755 index c677a5746..000000000 --- a/applications/osmo/deploy/002-setup/07-configure-storage.sh +++ /dev/null @@ -1,254 +0,0 @@ -#!/bin/bash -# -# Configure OSMO Storage -# https://nvidia.github.io/OSMO/main/deployment_guide/getting_started/configure_data_storage.html -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " OSMO Storage Configuration" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 - -# ----------------------------------------------------------------------------- -# Get Storage Configuration from Terraform -# ----------------------------------------------------------------------------- -log_info "Retrieving storage configuration from Terraform..." 
- -S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "") -S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "") - -# Default endpoint if not set -if [[ -z "$S3_ENDPOINT" ]]; then - S3_ENDPOINT="https://storage.eu-north1.nebius.cloud" -fi - -if [[ -z "$S3_BUCKET" ]]; then - log_error "Could not retrieve storage bucket name from Terraform" - echo "" - echo "Make sure you have run 'terraform apply' in deploy/001-iac" - echo "and that storage is enabled in your terraform.tfvars" - exit 1 -fi - -log_success "Storage bucket: ${S3_BUCKET}" -log_success "Storage endpoint: ${S3_ENDPOINT}" - -# ----------------------------------------------------------------------------- -# Check/Create osmo-storage secret -# ----------------------------------------------------------------------------- -log_info "Checking for osmo-storage secret..." - -if ! kubectl get secret osmo-storage -n osmo &>/dev/null; then - log_warning "osmo-storage secret not found - attempting to create from MysteryBox..." - - # Get credentials from Terraform/MysteryBox - S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") - S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") - S3_SECRET_KEY="" - - if [[ -n "$S3_SECRET_REF_ID" ]]; then - log_info "Retrieving storage secret from MysteryBox..." 
- # IAM access key secrets are stored with key "secret" in MysteryBox - S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") - fi - - if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then - log_error "Could not retrieve storage credentials" - echo "" - echo "Either re-run 05-deploy-osmo-control-plane.sh or create the secret manually:" - echo "" - echo " kubectl create secret generic osmo-storage \\" - echo " --namespace osmo \\" - echo " --from-literal=access-key-id= \\" - echo " --from-literal=secret-access-key=" - exit 1 - fi - - # Create the secret - kubectl create secret generic osmo-storage \ - --namespace osmo \ - --from-literal=access-key-id="${S3_ACCESS_KEY}" \ - --from-literal=secret-access-key="${S3_SECRET_KEY}" \ - --dry-run=client -o yaml | kubectl apply -f - - - log_success "osmo-storage secret created" -else - log_success "osmo-storage secret exists" -fi - -# ----------------------------------------------------------------------------- -# Start port-forward and configure storage -# ----------------------------------------------------------------------------- -log_info "Starting port-forward to OSMO service..." - -# Start port-forward using shared helper (auto-detects Envoy) -start_osmo_port_forward osmo 8080 -export _OSMO_PORT=8080 - -# Cleanup function -cleanup_port_forward() { - if [[ -n "${PORT_FORWARD_PID:-}" ]]; then - kill $PORT_FORWARD_PID 2>/dev/null || true - wait $PORT_FORWARD_PID 2>/dev/null || true - fi -} -trap cleanup_port_forward EXIT - -# Wait for port-forward to be ready -log_info "Waiting for port-forward to be ready..." 
-max_wait=30 -elapsed=0 -while true; do - HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") - if [[ "$HTTP_CODE" == "200" ]]; then - break - fi - sleep 1 - ((elapsed += 1)) - if [[ $elapsed -ge $max_wait ]]; then - log_error "Port-forward failed to start within ${max_wait}s" - exit 1 - fi -done -log_success "Port-forward ready" - -# Login (no-op when bypassing Envoy) -log_info "Logging in to OSMO..." -if ! osmo_login 8080; then - log_error "Failed to login to OSMO" - exit 1 -fi -log_success "Logged in successfully" - -# ----------------------------------------------------------------------------- -# Get Storage Credentials -# ----------------------------------------------------------------------------- -log_info "Retrieving storage credentials..." - -# Get access key from Terraform -S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") - -# Get secret key from osmo-storage secret (already created) -S3_SECRET_KEY=$(kubectl get secret osmo-storage -n osmo -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") - -if [[ -z "$S3_ACCESS_KEY" || -z "$S3_SECRET_KEY" ]]; then - log_error "Could not retrieve storage credentials" - exit 1 -fi - -# Nebius Object Storage uses S3-compatible API -# OSMO uses TOS (Torch Object Storage) scheme for S3-compatible storage with custom endpoints -# Format: tos:/// -S3_HOST=$(echo "$S3_ENDPOINT" | sed 's|https://||') -BACKEND_URI="tos://${S3_HOST}/${S3_BUCKET}" -REGION="eu-north1" - -log_success "Storage credentials retrieved" - -# ----------------------------------------------------------------------------- -# Configure Workflow Log Storage in OSMO -# ----------------------------------------------------------------------------- -log_info "Configuring workflow log storage..." 
- -# Create workflow log config JSON -WORKFLOW_LOG_CONFIG=$(cat < /tmp/workflow_log_config.json - -if osmo_config_update WORKFLOW /tmp/workflow_log_config.json "Configure workflow log storage"; then - log_success "Workflow log storage configured" -else - log_error "Failed to configure workflow log storage" - rm -f /tmp/workflow_log_config.json - exit 1 -fi - -# ----------------------------------------------------------------------------- -# Configure Workflow Data Storage in OSMO -# ----------------------------------------------------------------------------- -log_info "Configuring workflow data storage..." - -# Create workflow data config JSON -WORKFLOW_DATA_CONFIG=$(cat < /tmp/workflow_data_config.json - -if osmo_config_update WORKFLOW /tmp/workflow_data_config.json "Configure workflow data storage"; then - log_success "Workflow data storage configured" -else - log_error "Failed to configure workflow data storage" - rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json - exit 1 -fi - -# Cleanup temp files -rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json - -# ----------------------------------------------------------------------------- -# Verify Configuration -# ----------------------------------------------------------------------------- -log_info "Verifying storage configuration..." - -echo "" -echo "Workflow configuration:" -osmo config show WORKFLOW 2>/dev/null || \ - osmo_curl GET "http://localhost:8080/api/configs/workflow" 2>/dev/null | jq '.' || \ - log_warning "Could not retrieve workflow config for verification" - -# Cleanup -cleanup_port_forward -trap - EXIT - -echo "" -echo "========================================" -log_success "OSMO Storage configuration complete!" 
-echo "========================================" -echo "" -echo "Storage Details:" -echo " Bucket: ${S3_BUCKET}" -echo " Endpoint: ${S3_ENDPOINT}" -echo " Backend URI: ${BACKEND_URI}" -echo " Region: ${REGION}" -echo "" -echo "Configured:" -echo " - workflow_log: For storing workflow logs" -echo " - workflow_data: For storing intermediate task data" -echo "" -echo "OSMO can now store workflow artifacts in Nebius Object Storage." -echo "" diff --git a/applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh similarity index 100% rename from applications/osmo/deploy/002a-setup/08-configure-gpu-platform.sh rename to applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh diff --git a/applications/osmo/deploy/002-setup/08-configure-service-url.sh b/applications/osmo/deploy/002-setup/08-configure-service-url.sh deleted file mode 100755 index 3a0a24946..000000000 --- a/applications/osmo/deploy/002-setup/08-configure-service-url.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# -# Configure OSMO Service URL -# Required for osmo-ctrl sidecar to communicate with OSMO service -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " OSMO Service URL Configuration" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 - -# ----------------------------------------------------------------------------- -# Start port-forward -# ----------------------------------------------------------------------------- -log_info "Starting port-forward to OSMO service..." 
- -# Start port-forward using shared helper (auto-detects Envoy) -start_osmo_port_forward osmo 8080 -export _OSMO_PORT=8080 - -cleanup_port_forward() { - if [[ -n "${PORT_FORWARD_PID:-}" ]]; then - kill $PORT_FORWARD_PID 2>/dev/null || true - wait $PORT_FORWARD_PID 2>/dev/null || true - fi -} -trap cleanup_port_forward EXIT - -# Wait for port-forward to be ready -log_info "Waiting for port-forward to be ready..." -max_wait=30 -elapsed=0 -while true; do - HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null || echo "000") - if [[ "$HTTP_CODE" == "200" ]]; then - break - fi - sleep 1 - ((elapsed += 1)) - if [[ $elapsed -ge $max_wait ]]; then - log_error "Port-forward failed to start within ${max_wait}s" - exit 1 - fi -done -log_success "Port-forward ready" - -# Login (no-op when bypassing Envoy) -log_info "Logging in to OSMO..." -if ! osmo_login 8080; then - log_error "Failed to login to OSMO" - exit 1 -fi -log_success "Logged in successfully" - -# ----------------------------------------------------------------------------- -# Determine the target service URL -# ----------------------------------------------------------------------------- -log_info "Determining target service URL..." - -# Priority: -# 1. Explicit OSMO_INGRESS_BASE_URL (user override) -# 2. Auto-detect from NGINX Ingress Controller LoadBalancer -if [[ -n "${OSMO_INGRESS_BASE_URL:-}" ]]; then - SERVICE_URL="${OSMO_INGRESS_BASE_URL}" - log_info "Using explicit Ingress base URL: ${SERVICE_URL}" -elif DETECTED_URL=$(detect_service_url 2>/dev/null) && [[ -n "$DETECTED_URL" ]]; then - SERVICE_URL="${DETECTED_URL}" - log_info "Auto-detected service URL: ${SERVICE_URL}" -else - log_error "Could not detect NGINX Ingress Controller URL." - log_error "Ensure 03-deploy-nginx-ingress.sh was run and the LoadBalancer has an IP." 
- log_error "Or set OSMO_INGRESS_BASE_URL manually: export OSMO_INGRESS_BASE_URL=http://" - exit 1 -fi - -# ----------------------------------------------------------------------------- -# Check current service_base_url -# ----------------------------------------------------------------------------- -log_info "Checking current service_base_url..." - -CURRENT_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') -echo "Current service_base_url: '${CURRENT_URL}'" - -if [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" && "$CURRENT_URL" == "$SERVICE_URL" ]]; then - log_success "service_base_url is already correctly configured: ${CURRENT_URL}" - cleanup_port_forward - trap - EXIT - exit 0 -elif [[ -n "$CURRENT_URL" && "$CURRENT_URL" != "null" ]]; then - log_warning "service_base_url is set to '${CURRENT_URL}' but should be '${SERVICE_URL}'" - log_info "Updating service_base_url..." -fi - -# ----------------------------------------------------------------------------- -# Configure service_base_url -# ----------------------------------------------------------------------------- -log_info "Configuring service_base_url to: ${SERVICE_URL}" - -cat > /tmp/service_url_fix.json << EOF -{ - "service_base_url": "${SERVICE_URL}" -} -EOF - -if osmo_config_update SERVICE /tmp/service_url_fix.json "Set service_base_url for osmo-ctrl sidecar"; then - log_success "service_base_url configured" -else - log_error "Failed to configure service_base_url" - rm -f /tmp/service_url_fix.json - exit 1 -fi - -rm -f /tmp/service_url_fix.json - -# ----------------------------------------------------------------------------- -# Verify Configuration -# ----------------------------------------------------------------------------- -log_info "Verifying configuration..." 
- -NEW_URL=$(osmo_curl GET "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - -if [[ "$NEW_URL" == "$SERVICE_URL" ]]; then - log_success "service_base_url verified: ${NEW_URL}" -else - log_error "Verification failed. Expected: ${SERVICE_URL}, Got: ${NEW_URL}" - exit 1 -fi - -# Cleanup -cleanup_port_forward -trap - EXIT - -echo "" -echo "========================================" -log_success "OSMO Service URL configuration complete!" -echo "========================================" -echo "" -echo "Service URL: ${SERVICE_URL}" -echo "" -echo "This URL is used by the osmo-ctrl sidecar container to:" -echo " - Stream workflow logs to the OSMO service" -echo " - Report task status and completion" -echo " - Fetch authentication tokens" -echo "" diff --git a/applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh b/applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh new file mode 100755 index 000000000..27698e57b --- /dev/null +++ b/applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Configure BACKEND scheduler_settings (KAI scheduler + coscheduling) for Nebius OSMO. +# Run after 05-deploy-osmo-backend.sh once the backend is ONLINE. +# Option A: Patch existing backend (keeps router_address, etc.) – default. +# Option B: Apply config from config/scheduler-config.template.json (set ROUTER_ADDRESS, etc.). 
+ +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_DIR="${SCRIPT_DIR}/config" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" +K8S_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}" + +# Use template (Option B) if --from-template and template exists +USE_TEMPLATE=false +[[ "${1:-}" == "--from-template" ]] && USE_TEMPLATE=true + +echo "" +echo "========================================" +echo " Configure BACKEND scheduler (KAI + coscheduling)" +echo "========================================" +echo "" + +check_kubectl || exit 1 +command -v jq &>/dev/null || { log_error "jq is required"; exit 1; } + +# ----------------------------------------------------------------------------- +# Start port-forward and login +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill "$PORT_FORWARD_PID" 2>/dev/null || true + wait "$PORT_FORWARD_PID" 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +log_info "Waiting for port-forward..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +osmo_login 8080 || true + +# ----------------------------------------------------------------------------- +# Build backend config and apply +# ----------------------------------------------------------------------------- +if [[ "$USE_TEMPLATE" == "true" && -f "${CONFIG_DIR}/scheduler-config.template.json" ]]; then + # Option B: Render template and apply (set ROUTER_ADDRESS before running) + log_info "Using config from scheduler-config.template.json..." + if [[ -z "${ROUTER_ADDRESS:-}" ]]; then + # Derive from ingress: https://host -> wss://host + INGRESS_URL=$(detect_service_url 2>/dev/null || true) + if [[ -n "$INGRESS_URL" ]]; then + ROUTER_ADDRESS="wss://$(echo "$INGRESS_URL" | sed -e 's|https\?://||' -e 's|/.*||')" + log_info "Derived ROUTER_ADDRESS from ingress: ${ROUTER_ADDRESS}" + else + log_error "Set ROUTER_ADDRESS (e.g. wss://your-osmo-host) or run without --from-template to patch existing backend" + exit 1 + fi + fi + export BACKEND_NAME + export K8S_NAMESPACE + export ROUTER_ADDRESS + mkdir -p "${CONFIG_DIR}/out" + envsubst < "${CONFIG_DIR}/scheduler-config.template.json" > "${CONFIG_DIR}/out/scheduler-config.json" + BACKEND_FILE="${CONFIG_DIR}/out/scheduler-config.json" + if ! osmo config update BACKEND "$BACKEND_NAME" --file "$BACKEND_FILE" --description "Backend $BACKEND_NAME scheduler (KAI + coscheduling)"; then + log_error "Failed to apply backend config from template" + exit 1 + fi +else + # Option A: Patch existing backend (keep router_address and other fields) + log_info "Patching existing backend '$BACKEND_NAME' scheduler_settings (KAI + coscheduling)..." 
+ BACKEND_JSON=$(osmo_curl GET "${OSMO_URL}/api/configs/backend" 2>/dev/null || true) + if [[ -z "$BACKEND_JSON" ]]; then + log_error "Could not get backend config. Is the backend registered? Run: osmo config show BACKEND" + exit 1 + fi + BACKEND_OBJECT=$(echo "$BACKEND_JSON" | jq -c --arg name "$BACKEND_NAME" \ + '.backends[] | select(.name == $name) | . + {scheduler_settings: {"scheduler_type":"kai","scheduler_name":"kai-scheduler","coscheduling":true,"scheduler_timeout":30}}') + if [[ -z "$BACKEND_OBJECT" || "$BACKEND_OBJECT" == "null" ]]; then + log_error "Backend '$BACKEND_NAME' not found in config. Available: $(echo "$BACKEND_JSON" | jq -r '.backends[].name' 2>/dev/null | tr '\n' ' ')" + exit 1 + fi + TMP_FILE=$(mktemp) + echo "$BACKEND_OBJECT" > "$TMP_FILE" + if ! osmo config update BACKEND "$BACKEND_NAME" --file "$TMP_FILE" --description "Backend $BACKEND_NAME scheduler (KAI + coscheduling)"; then + rm -f "$TMP_FILE" + log_error "Failed to update backend config" + exit 1 + fi + rm -f "$TMP_FILE" +fi + +log_success "BACKEND scheduler configuration applied" +echo "" +echo "Verify:" +echo " osmo config show BACKEND ${BACKEND_NAME}" +echo "" +echo "You should see scheduler_settings: scheduler_type=kai, coscheduling=true" +echo "" diff --git a/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh b/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh deleted file mode 100755 index 7d9268f17..000000000 --- a/applications/osmo/deploy/002-setup/09-configure-gpu-platform.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# Configure OSMO GPU platform with tolerations via pod templates -# Based on OSMO documentation: https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/resource_pools.html - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" - -echo "" -echo "========================================" 
-echo " OSMO GPU Platform Configuration" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 - -# ----------------------------------------------------------------------------- -# Start port-forward (auto-detects Envoy and bypasses if needed) -# ----------------------------------------------------------------------------- -log_info "Starting port-forward to OSMO service..." -start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 -export _OSMO_PORT=8080 - -cleanup_port_forward() { - if [[ -n "${PORT_FORWARD_PID:-}" ]]; then - kill $PORT_FORWARD_PID 2>/dev/null || true - wait $PORT_FORWARD_PID 2>/dev/null || true - fi -} -trap cleanup_port_forward EXIT - -OSMO_URL="http://localhost:8080" - -# Wait for port-forward to be ready (reject 302 — that means Envoy redirect, not direct) -log_info "Waiting for port-forward to be ready..." -max_wait=30 -elapsed=0 -while true; do - HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null || echo "000") - if [[ "$HTTP_CODE" == "200" ]]; then - break - fi - sleep 1 - ((elapsed += 1)) - if [[ $elapsed -ge $max_wait ]]; then - log_error "Port-forward failed to start within ${max_wait}s (last HTTP: ${HTTP_CODE})" - exit 1 - fi -done -log_success "Port-forward ready" - -# Login (no-op when bypassing Envoy) -osmo_login 8080 - -# ----------------------------------------------------------------------------- -# Step 0: Label nodes with OSMO pool/platform -# ----------------------------------------------------------------------------- -# OSMO discovers resources via node labels: -# osmo.nvidia.com/pool= — assigns node to a pool -# osmo.nvidia.com/platform= — assigns node to a platform within the pool -# GPU nodes get platform=gpu, CPU-only nodes get platform=default. -log_info "Labeling nodes with OSMO pool/platform..." 
- -NODE_COUNT=0 -GPU_NODE_COUNT=0 -for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do - has_gpu=$(kubectl get node "$node" -o jsonpath='{.metadata.labels.nvidia\.com/gpu\.present}' 2>/dev/null) - gpu_count=$(kubectl get node "$node" -o jsonpath='{.status.allocatable.nvidia\.com/gpu}' 2>/dev/null) - - kubectl label node "$node" osmo.nvidia.com/pool=default --overwrite &>/dev/null - - if [[ "$has_gpu" == "true" ]] || [[ -n "$gpu_count" && "$gpu_count" -gt 0 ]] 2>/dev/null; then - kubectl label node "$node" osmo.nvidia.com/platform=gpu --overwrite &>/dev/null - ((GPU_NODE_COUNT++)) || true - else - kubectl label node "$node" osmo.nvidia.com/platform=default --overwrite &>/dev/null - fi - ((NODE_COUNT++)) || true -done - -log_success "Labeled ${NODE_COUNT} nodes (${GPU_NODE_COUNT} GPU, $((NODE_COUNT - GPU_NODE_COUNT)) CPU-only)" - -# Give the backend listener time to process node label changes -sleep 5 - -# ----------------------------------------------------------------------------- -# Step 1: Create GPU pod template -# ----------------------------------------------------------------------------- -log_info "Creating gpu_tolerations pod template..." - -RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ - -d @"${SCRIPT_DIR}/gpu_pod_template.json" \ - -w "\n%{http_code}") -HTTP_CODE=$(echo "$RESPONSE" | tail -n1) -BODY=$(echo "$RESPONSE" | sed '$d') - -if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then - log_success "Pod template created (HTTP ${HTTP_CODE})" -else - log_error "Failed to create pod template (HTTP ${HTTP_CODE})" - echo "Response: ${BODY}" - exit 1 -fi - -# ----------------------------------------------------------------------------- -# Step 2: Create GPU platform -# ----------------------------------------------------------------------------- -log_info "Creating gpu platform in default pool..." 
- -RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ - -d @"${SCRIPT_DIR}/gpu_platform_update.json" \ - -w "\n%{http_code}") -HTTP_CODE=$(echo "$RESPONSE" | tail -n1) -BODY=$(echo "$RESPONSE" | sed '$d') - -if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then - log_success "GPU platform created (HTTP ${HTTP_CODE})" -else - log_error "Failed to create GPU platform (HTTP ${HTTP_CODE})" - echo "Response: ${BODY}" - exit 1 -fi - -# ----------------------------------------------------------------------------- -# Step 3: Verify configuration -# ----------------------------------------------------------------------------- -log_info "Verifying configuration..." - -echo "" -echo "Pod templates:" -osmo_curl GET "${OSMO_URL}/api/configs/pod_template" | jq 'keys' - -echo "" -echo "GPU platform config:" -osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' - -# ----------------------------------------------------------------------------- -# Step 4: Check GPU resources -# ----------------------------------------------------------------------------- -log_info "Checking GPU resources..." 
-sleep 3 # Wait for backend to pick up changes - -RESOURCE_COUNT=$(osmo_curl GET "${OSMO_URL}/api/resources" | jq '[.resources[] | select(.allocatable_fields.gpu != null)] | length') -echo "GPU nodes visible to OSMO: ${RESOURCE_COUNT}" - -if [[ "$RESOURCE_COUNT" -gt 0 ]]; then - echo "" - echo "GPU resources:" - osmo_curl GET "${OSMO_URL}/api/resources" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' -fi - -# ----------------------------------------------------------------------------- -# Done -# ----------------------------------------------------------------------------- -log_success "GPU platform configuration complete" -echo "" -echo "To submit a GPU workflow:" -echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default" -echo "" -echo "Or test via curl:" -echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" -echo "" diff --git a/applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh b/applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh new file mode 100755 index 000000000..5db244d22 --- /dev/null +++ b/applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh @@ -0,0 +1,249 @@ +#!/bin/bash +# +# Register the Nebius storage bucket as an OSMO dataset bucket. +# This allows using the bucket for OSMO datasets (e.g. osmo dataset upload/list) +# with a short name (e.g. nebius/my-dataset) instead of full URIs. +# +# Requires: 06-configure-storage.sh (port-forward and workflow storage) and +# OSMO control plane running. Uses the same bucket and credentials as workflow storage. 
+#
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
+source "${SCRIPT_DIR}/lib/common.sh"
+source "${SCRIPT_DIR}/defaults.sh"
+
+echo ""
+echo "========================================"
+echo " OSMO Dataset Bucket Configuration"
+echo "========================================"
+echo ""
+
+# Optional: name for the bucket in OSMO (default: nebius)
+DATASET_BUCKET_NAME="${DATASET_BUCKET_NAME:-nebius}"
+
+# Check prerequisites
+check_kubectl || exit 1
+
+# -----------------------------------------------------------------------------
+# Select Nebius Region
+# -----------------------------------------------------------------------------
+VALID_REGIONS=("eu-north1" "me-west1")
+
+if [[ -n "${NEBIUS_REGION:-}" ]]; then
+    REGION="$NEBIUS_REGION"
+    matched=false
+    for r in "${VALID_REGIONS[@]}"; do
+        [[ "$r" == "$REGION" ]] && matched=true && break
+    done
+    if ! $matched; then
+        log_error "Invalid NEBIUS_REGION '${REGION}'. Valid options: ${VALID_REGIONS[*]}"
+        exit 1
+    fi
+    log_info "Using region from NEBIUS_REGION: ${REGION}"
+else
+    echo "Select the Nebius region for the storage bucket:"
+    echo ""
+    _idx=1
+    for _r in "${VALID_REGIONS[@]}"; do
+        echo "  ${_idx}) ${_r}"
+        _idx=$((_idx + 1))
+    done
+    echo ""
+    while true; do
+        printf "Enter choice [1-${#VALID_REGIONS[@]}]: "
+        read -r choice
+        if [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#VALID_REGIONS[@]} )); then
+            # This script runs under bash (see shebang), whose arrays are
+            # 0-based: menu choice N corresponds to element N-1. Indexing by
+            # $choice directly returned the wrong region for choice 1 and
+            # only worked for choice 2 via an empty-value fallback.
+            REGION="${VALID_REGIONS[$((choice - 1))]}"
+            break
+        fi
+        echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}."
+    done
+    log_info "Selected region: ${REGION}"
+fi
+
+S3_REGION_FOR_BOTO="${REGION}"
+
+# -----------------------------------------------------------------------------
+# Get Storage Configuration from Terraform
+# -----------------------------------------------------------------------------
+log_info "Retrieving storage configuration from Terraform..."
+
+S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "")
+S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "")
+
+if [[ -z "$S3_ENDPOINT" ]]; then
+    S3_ENDPOINT="https://storage.${REGION}.nebius.cloud"
+fi
+
+if [[ -z "$S3_BUCKET" ]]; then
+    log_error "Could not retrieve storage bucket name from Terraform"
+    echo ""
+    echo "Run 'terraform apply' in deploy/001-iac and ensure storage is enabled."
+    exit 1
+fi
+
+# Datasets are stored under the osmo-datasets prefix within the bucket.
+# The path uses the standard s3://<bucket>/<prefix> format; the actual endpoint
+# is configured separately via AWS_ENDPOINT_URL_S3 in the Helm chart / pod template.
+DATASET_PATH="s3://${S3_BUCKET}/osmo-datasets"
+
+# -----------------------------------------------------------------------------
+# Get storage credentials (for default_credential on the dataset bucket)
+# -----------------------------------------------------------------------------
+log_info "Retrieving storage credentials for default_credential..."
+ +S3_ACCESS_KEY=$(get_tf_output "storage_credentials.access_key_id" "../001-iac" 2>/dev/null || echo "") +S3_SECRET_KEY=$(kubectl get secret osmo-storage -n "${OSMO_NAMESPACE:-osmo}" -o jsonpath='{.data.secret-access-key}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$S3_ACCESS_KEY" ]]; then + log_warning "Could not get access key from Terraform; bucket will have no default_credential" +fi +if [[ -z "$S3_SECRET_KEY" ]]; then + S3_SECRET_REF_ID=$(get_tf_output "storage_secret_reference_id" "../001-iac" 2>/dev/null || echo "") + if [[ -n "$S3_SECRET_REF_ID" ]]; then + S3_SECRET_KEY=$(get_mysterybox_secret "$S3_SECRET_REF_ID" "secret" 2>/dev/null || echo "") + fi +fi + +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + log_success "Storage credentials retrieved (default_credential will be set)" +else + log_warning "Missing credentials; registering bucket without default_credential (users must supply credentials)" +fi + +log_success "Bucket: ${S3_BUCKET}" +log_success "Dataset path: ${DATASET_PATH}" +log_success "Region: ${REGION}" +log_success "S3 endpoint: ${S3_ENDPOINT}" +log_success "OSMO bucket name: ${DATASET_BUCKET_NAME}" + +# ----------------------------------------------------------------------------- +# Start port-forward and configure dataset bucket +# ----------------------------------------------------------------------------- +log_info "Starting port-forward to OSMO service..." + +OSMO_NS="${OSMO_NAMESPACE:-osmo}" +start_osmo_port_forward "${OSMO_NS}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT + +log_info "Waiting for port-forward..." +max_wait=30 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + elapsed=$((elapsed + 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + exit 1 + fi +done +log_success "Port-forward ready" + +osmo_login 8080 || exit 1 + +# ----------------------------------------------------------------------------- +# Build dataset config: add/update Nebius bucket and set as default bucket +# See: https://nvidia.github.io/OSMO/main/deployment_guide/advanced_config/dataset_buckets.html +# ----------------------------------------------------------------------------- +log_info "Building DATASET config (bucket + default_bucket)..." + +# Build bucket config object (with optional default_credential) +# PATCH API accepts only access_key_id and access_key in default_credential; +# endpoint/region are taken from the bucket at runtime. +BUCKET_JSON="/tmp/osmo_dataset_bucket_obj.json" +if [[ -n "$S3_ACCESS_KEY" && -n "$S3_SECRET_KEY" ]]; then + jq -n \ + --arg path "$DATASET_PATH" \ + --arg region "$S3_REGION_FOR_BOTO" \ + --arg akid "$S3_ACCESS_KEY" \ + --arg ak "$S3_SECRET_KEY" \ + '{ + dataset_path: $path, + region: $region, + description: "Nebius Object Storage bucket", + mode: "read-write", + default_credential: { + access_key_id: $akid, + access_key: $ak + } + }' > "${BUCKET_JSON}" +else + jq -n \ + --arg path "$DATASET_PATH" \ + --arg region "$S3_REGION_FOR_BOTO" \ + '{ + dataset_path: $path, + region: $region, + description: "Nebius Object Storage bucket", + mode: "read-write" + }' > "${BUCKET_JSON}" +fi + +# Fetch current dataset config so we can merge (preserve other buckets if any) +CURRENT_DATASET="/tmp/osmo_dataset_current.json" +if osmo_curl GET "http://localhost:8080/api/configs/dataset" 2>/dev/null | jq -r '.configs_dict // . | if type == "object" then . 
else empty end' > "${CURRENT_DATASET}" 2>/dev/null && [[ -s "${CURRENT_DATASET}" ]]; then + # Merge: add/overwrite our bucket and set default_bucket (users can omit bucket prefix) + UPDATED_DATASET="/tmp/osmo_dataset_updated.json" + jq --arg name "$DATASET_BUCKET_NAME" \ + --slurpfile bucket "${BUCKET_JSON}" \ + '.buckets[$name] = $bucket[0] | .default_bucket = $name' \ + "${CURRENT_DATASET}" > "${UPDATED_DATASET}" +else + # No existing config: create new with single bucket and set as default_bucket + UPDATED_DATASET="/tmp/osmo_dataset_updated.json" + jq -n --arg name "$DATASET_BUCKET_NAME" \ + --slurpfile bucket "${BUCKET_JSON}" \ + '{ buckets: { ($name): $bucket[0] }, default_bucket: $name }' \ + > "${UPDATED_DATASET}" +fi + +if osmo_config_update DATASET "${UPDATED_DATASET}" "Register Nebius bucket and set as default dataset bucket"; then + log_success "Dataset bucket configured and set as default" +else + log_error "Failed to configure dataset bucket" + rm -f "${BUCKET_JSON}" "${CURRENT_DATASET}" "${UPDATED_DATASET}" + exit 1 +fi + +rm -f "${BUCKET_JSON}" "${CURRENT_DATASET}" "${UPDATED_DATASET}" + +# ----------------------------------------------------------------------------- +# Verify +# ----------------------------------------------------------------------------- +log_info "Verifying..." +echo "" +osmo_curl GET "http://localhost:8080/api/configs/dataset" 2>/dev/null | jq '.configs_dict // .' || true + +cleanup_port_forward +trap - EXIT + +echo "" +echo "========================================" +log_success "OSMO dataset bucket configuration complete!" +echo "========================================" +echo "" +echo "Bucket '${DATASET_BUCKET_NAME}' is registered and set as the default bucket." 
+echo " dataset_path: ${DATASET_PATH}" +echo " default_bucket: ${DATASET_BUCKET_NAME}" +echo "" +echo "With default_bucket set, you can reference datasets without the bucket prefix:" +echo " my-dataset:latest (instead of ${DATASET_BUCKET_NAME}/my-dataset:latest)" +echo "" +echo "Usage:" +echo " osmo profile set bucket ${DATASET_BUCKET_NAME}" +echo " osmo bucket list" +echo " osmo dataset upload my-dataset:latest ./data" +echo "" diff --git a/applications/osmo/deploy/002-setup/README.md b/applications/osmo/deploy/002-setup/README.md index 0cde10d5a..05ec8b55c 100755 --- a/applications/osmo/deploy/002-setup/README.md +++ b/applications/osmo/deploy/002-setup/README.md @@ -24,20 +24,17 @@ Run scripts in order: # 3. NGINX Ingress Controller (required – provides routing for OSMO services) ./03-deploy-nginx-ingress.sh -# 4. Enable TLS (optional, recommended – set up DNS A record first) -./04-enable-tls.sh +# 4. OSMO Control Plane +./04-deploy-osmo-control-plane.sh -# 5. OSMO Control Plane -./05-deploy-osmo-control-plane.sh +# 5. OSMO Backend +./05-deploy-osmo-backend.sh -# 6. OSMO Backend -./06-deploy-osmo-backend.sh +# 6. Configure Storage (requires port-forward, see main README) +./06-configure-storage.sh -# 7. Configure Storage (requires port-forward, see main README) -./07-configure-storage.sh - -# 8. Configure GPU Platform (required for GPU workflows) -./09-configure-gpu-platform.sh +# 7. 
Configure GPU Platform (required for GPU workflows) +./08-configure-gpu-platform.sh ``` ## Scripts @@ -47,12 +44,11 @@ Run scripts in order: | `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | | `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | | `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | -| `04-enable-tls.sh` | TLS/HTTPS via cert-manager + Let's Encrypt (optional, recommended) | ~2 min | -| `05-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | -| `06-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | -| `07-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | -| `08-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | -| `09-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | +| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | +| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | +| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | +| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | +| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | ## Configuration @@ -83,7 +79,7 @@ OSMO_NAMESPACE="osmo" # Grafana password (auto-generated if empty) GRAFANA_ADMIN_PASSWORD="" -# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 05-deploy-osmo-control-plane.sh) +# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. 
osmo.example.com); leave empty for IP-based access OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty ``` @@ -97,7 +93,7 @@ If you ran `secrets-init.sh` in the prerequisites step, the following environmen | `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | | `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | -The `05-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. +The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. **Secret retrieval order:** 1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) @@ -182,7 +178,7 @@ Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pod ### Option 1: Run the Configuration Script (Recommended) ```bash -./09-configure-gpu-platform.sh +./08-configure-gpu-platform.sh ``` ### Option 2: Manual Configuration via API @@ -358,7 +354,7 @@ If OSMO shows 0 GPUs or GPU workflows fail to schedule: 4. If missing, run the GPU configuration: ```bash - ./09-configure-gpu-platform.sh + ./08-configure-gpu-platform.sh ``` 5. 
Verify OSMO sees GPU resources: diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/002-setup/defaults.sh index 7938b3d4b..2c9f3feca 100755 --- a/applications/osmo/deploy/002-setup/defaults.sh +++ b/applications/osmo/deploy/002-setup/defaults.sh @@ -23,11 +23,7 @@ export TOOLKIT_ENABLED="true" export DEVICE_PLUGIN_ENABLED="true" export MIG_MANAGER_ENABLED="false" -# Driverfull images (Nebius pre-installed NVIDIA drivers, skips GPU Operator driver) -# Recommended for B200/B300 GPUs where the GPU Operator's bundled driver may not support NVSwitch. -export USE_DRIVERFULL_IMAGES="${USE_DRIVERFULL_IMAGES:-}" # Auto-detected from Terraform; set "true"/"false" to override - -# Network Operator (only needed for InfiniBand/GPU clusters without driverfull images) +# Network Operator (only needed for InfiniBand/GPU clusters) export ENABLE_NETWORK_OPERATOR="false" # Set to "true" if using InfiniBand # Observability settings @@ -43,16 +39,34 @@ export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" # Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty. export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}" +# TLS / SSL Configuration +# TLS enabled by default. Requires OSMO_INGRESS_HOSTNAME to be set. Set to false to disable. +export OSMO_TLS_ENABLED="${OSMO_TLS_ENABLED:-true}" +# Name of the Kubernetes TLS secret used by Ingress (both paths produce this secret). +# NOTE: The OSMO Helm chart generates ingress TLS with secretName "osmo-tls". +export OSMO_TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}" +# Local directory where certbot stores certificate files (Path A only). +export OSMO_TLS_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" +# Email for Let's Encrypt registration (required for 03a and 03c). +export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-}" +# cert-manager namespace (Path B / 03c only). 
+export CERT_MANAGER_NAMESPACE="${CERT_MANAGER_NAMESPACE:-cert-manager}"
+# Name of the ClusterIssuer created by 03c (Path B only).
+export CLUSTER_ISSUER_NAME="${CLUSTER_ISSUER_NAME:-letsencrypt-prod}"
+# TLS mode: "certbot" or "cert-manager". Set automatically by 03a/03c.
+export OSMO_TLS_MODE="${OSMO_TLS_MODE:-}"
+
 # Keycloak / Authentication
-# Set DEPLOY_KEYCLOAK=true to deploy Keycloak and enable OSMO auth with Envoy sidecars.
-export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}"
-# Keycloak hostname (e.g. auth.osmo.example.com).
-# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth.<hostname>.
+# Keycloak deployed by default. Requires OSMO_INGRESS_HOSTNAME or KEYCLOAK_HOSTNAME. Set to false to disable.
+export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-true}"
+# Keycloak hostname (e.g. auth-osmo-nebius.csptst.nvidia.com).
+# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth-<hostname>.
+export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}"
+# TLS secret name for the Keycloak ingress (separate from the main osmo-tls).
+# Run 03a with OSMO_TLS_SECRET_NAME=osmo-tls-auth for the auth subdomain.
export KEYCLOAK_TLS_SECRET_NAME="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" # Paths -export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" export VALUES_DIR="${SCRIPT_DIR}/values" export LIB_DIR="${SCRIPT_DIR}/lib" diff --git a/applications/osmo/deploy/002-setup/gpu_pod_template.json b/applications/osmo/deploy/002-setup/gpu_pod_template.json index ae651e3ba..107207028 100755 --- a/applications/osmo/deploy/002-setup/gpu_pod_template.json +++ b/applications/osmo/deploy/002-setup/gpu_pod_template.json @@ -8,9 +8,62 @@ "effect": "NoSchedule" } ], + "containers": [ + { + "name": "{{USER_CONTAINER_NAME}}", + "env": [ + { + "name": "AWS_ENDPOINT_URL_S3", + "value": "https://storage.me-west1.nebius.cloud:443" + }, + { + "name": "AWS_S3_FORCE_PATH_STYLE", + "value": "true" + }, + { + "name": "AWS_DEFAULT_REGION", + "value": "us-east-1" + }, + { + "name": "OSMO_LOGIN_DEV", + "value": "true" + }, + { + "name": "OSMO_SKIP_DATA_AUTH", + "value": "1" + } + ] + }, + { + "name": "osmo-ctrl", + "env": [ + { + "name": "AWS_ENDPOINT_URL_S3", + "value": "https://storage.me-west1.nebius.cloud:443" + }, + { + "name": "AWS_S3_FORCE_PATH_STYLE", + "value": "true" + }, + { + "name": "AWS_DEFAULT_REGION", + "value": "us-east-1" + }, + { + "name": "OSMO_LOGIN_DEV", + "value": "true" + }, + { + "name": "OSMO_SKIP_DATA_AUTH", + "value": "1" + } + ] + } + ], "nodeSelector": { "nvidia.com/gpu.present": "true" } } - } + }, + "description": "Add compute pod template" } diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/002-setup/lib/common.sh index 86baa93bc..87abfb573 100755 --- a/applications/osmo/deploy/002-setup/lib/common.sh +++ b/applications/osmo/deploy/002-setup/lib/common.sh @@ -27,9 +27,6 @@ log_error() { echo -e "${RED}[✗]${NC} $1" } -# Pause on error so the user can read the output before the terminal closes -trap '_exit_code=$?; if [[ $_exit_code -ne 0 ]]; then 
echo ""; log_error "Script failed (exit code $_exit_code). Press Enter to close..."; read -r; fi' EXIT - # Read input with a prompt into a variable (bash/zsh compatible). read_prompt_var() { local prompt=$1 @@ -202,8 +199,8 @@ wait_for_pods() { # Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. # -# When TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, returns https://. -# Otherwise falls back to http://. +# When OSMO_TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, returns +# https://. Otherwise falls back to http://. # # Lookup order: # 0. If TLS enabled + hostname set, return https:// immediately @@ -216,12 +213,13 @@ wait_for_pods() { # [[ -n "$url" ]] && echo "OSMO reachable at $url" detect_service_url() { local ns="${INGRESS_NAMESPACE:-ingress-nginx}" - local tls_enabled="${TLS_ENABLED:-false}" + local tls_enabled="${OSMO_TLS_ENABLED:-false}" local hostname="${OSMO_INGRESS_HOSTNAME:-}" local scheme="http" if [[ "$tls_enabled" == "true" ]]; then scheme="https" + # If hostname is configured, prefer it (TLS certs are issued for the domain) if [[ -n "$hostname" ]]; then echo "${scheme}://${hostname}" return 0 @@ -264,109 +262,6 @@ detect_service_url() { return 1 } -# --------------------------------------------------------------------------- -# Envoy-aware helpers -# When Envoy sidecar is present, these helpers port-forward directly to the -# OSMO pod on port 8000 (bypassing Envoy) and inject x-osmo-user / x-osmo-roles -# headers so the API recognises the caller as an admin. -# --------------------------------------------------------------------------- - -# Check whether a pod matching a label selector has an "envoy" container. 
-# Usage: has_envoy_sidecar -has_envoy_sidecar() { - local ns="${1:-osmo}" - # The OSMO Helm chart uses the label "app=osmo-service" (not app.kubernetes.io/name) - local label="${2:-app=osmo-service}" - kubectl get pods -n "$ns" -l "$label" -o jsonpath='{.items[0].spec.containers[*].name}' 2>/dev/null | grep -qw envoy -} - -# Port-forward to the OSMO service. -# When Envoy is present, forwards to the first matching *pod* on port 8000 -# (direct access, no auth). Otherwise forwards to svc/osmo-service:80. -# Sets PORT_FORWARD_PID and _OSMO_AUTH_BYPASS. -# Usage: start_osmo_port_forward -start_osmo_port_forward() { - local ns="${1:-osmo}" - local local_port="${2:-8080}" - - # Kill any stale port-forward on the target port (e.g. from a previous sourced run) - if command -v lsof &>/dev/null && lsof -ti :"$local_port" &>/dev/null; then - log_warning "Port ${local_port} already in use — killing stale process" - kill $(lsof -ti :"$local_port") 2>/dev/null || true - sleep 1 - fi - - if has_envoy_sidecar "$ns" "app=osmo-service"; then - log_info "Envoy sidecar detected — port-forwarding to pod:8000 (bypass Envoy)" - local pod_name - pod_name=$(kubectl get pods -n "$ns" -l app=osmo-service \ - -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - kubectl port-forward -n "$ns" "pod/${pod_name}" "${local_port}:8000" &>/dev/null & - PORT_FORWARD_PID=$! - _OSMO_AUTH_BYPASS="true" - else - log_info "No Envoy sidecar — port-forwarding to svc/osmo-service:80" - kubectl port-forward -n "$ns" svc/osmo-service "${local_port}:80" &>/dev/null & - PORT_FORWARD_PID=$! - _OSMO_AUTH_BYPASS="false" - fi - export PORT_FORWARD_PID _OSMO_AUTH_BYPASS -} - -# Wrapper around curl that injects admin headers when bypassing Envoy. -# Usage: osmo_curl [extra_curl_args...] 
-osmo_curl() { - local method="$1"; shift - local url="$1"; shift - local extra_args=("$@") - - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - curl -s -X "$method" "$url" \ - -H "x-osmo-user: osmo-admin" \ - -H "x-osmo-roles: osmo-admin,osmo-user" \ - -H "Content-Type: application/json" \ - "${extra_args[@]}" - else - curl -s -X "$method" "$url" \ - -H "Content-Type: application/json" \ - "${extra_args[@]}" - fi -} - -# Login to OSMO via the CLI. No-op when bypassing Envoy (headers handle auth). -# Usage: osmo_login -osmo_login() { - local port="${1:-8080}" - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - log_info "Auth bypass active — skipping osmo login" - return 0 - fi - osmo login "http://localhost:${port}" --method dev --username admin 2>/dev/null -} - -# Update an OSMO config. -# When bypassing Envoy, uses the PATCH API with configs_dict wrapper -# (avoids the "osmo config update" CLI which may not work without a real session). -# Otherwise delegates to `osmo config update`. 
-# Usage: osmo_config_update -osmo_config_update() { - local config_type="$1" - local json_file="$2" - local description="${3:-Update config}" - local port="${_OSMO_PORT:-8080}" - local config_type_lower - config_type_lower=$(echo "$config_type" | tr '[:upper:]' '[:lower:]') - - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - # Wrap the JSON in configs_dict and PATCH directly - local payload - payload=$(jq -n --argjson data "$(cat "$json_file")" '{"configs_dict": $data}') - osmo_curl PATCH "http://localhost:${port}/api/configs/${config_type_lower}" -d "$payload" - else - osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null - fi -} - # Get Terraform output (supports nested values like "postgresql.host") get_tf_output() { local name=$1 @@ -417,3 +312,123 @@ get_mysterybox_secret() { echo "$result" | jq -r '.data.string_value // empty' 2>/dev/null fi } + +# ----------------------------------------------------------------------------- +# OSMO API helpers (for use when Envoy auth sidecar is present) +# ----------------------------------------------------------------------------- +# Per OSMO documentation, the OSMO service authorises requests by reading +# the x-osmo-user and x-osmo-roles headers. Envoy normally sets these from +# the JWT but when we bypass Envoy (port-forward to pod:8000) we must set +# them ourselves. 
+# +# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/authentication_flow.html + +# Detect if a pod has an Envoy sidecar container +# Usage: has_envoy_sidecar +# Returns 0 (true) if envoy container is found, 1 (false) otherwise +has_envoy_sidecar() { + local ns="$1" + local label="$2" + local pod_name + pod_name=$(kubectl get pod -n "$ns" -l "$label" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [[ -z "$pod_name" ]]; then + return 1 + fi + kubectl get pod -n "$ns" "$pod_name" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null | grep -q envoy +} + +# Start a port-forward that bypasses Envoy when the sidecar is present. +# Sets PORT_FORWARD_PID and prints log messages. +# Usage: start_osmo_port_forward [local_port] +start_osmo_port_forward() { + local ns="${1:-osmo}" + local local_port="${2:-8080}" + + if has_envoy_sidecar "$ns" "app=osmo-service"; then + local pod_name + pod_name=$(kubectl get pod -n "$ns" -l app=osmo-service -o jsonpath='{.items[0].metadata.name}') + log_info "Envoy sidecar detected -- port-forwarding to pod/${pod_name}:8000 (bypassing auth)..." + kubectl port-forward -n "$ns" "pod/${pod_name}" "${local_port}:8000" &>/dev/null & + _OSMO_AUTH_BYPASS=true + else + log_info "No Envoy sidecar -- port-forwarding to svc/osmo-service:80..." + kubectl port-forward -n "$ns" svc/osmo-service "${local_port}:80" &>/dev/null & + _OSMO_AUTH_BYPASS=false + fi + PORT_FORWARD_PID=$! + export _OSMO_AUTH_BYPASS +} + +# Make an authenticated curl call to the OSMO API. +# When _OSMO_AUTH_BYPASS=true (Envoy bypassed), injects x-osmo-user and +# x-osmo-roles headers so the OSMO service authorises the request. +# Usage: osmo_curl [curl-args...] 
+# Example: osmo_curl GET "http://localhost:8080/api/configs/service" +# Example: osmo_curl PATCH "http://localhost:8080/api/configs/service" -d '{"configs_dict":{...}}' +osmo_curl() { + local method="$1"; shift + local url="$1"; shift + + local auth_args=() + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + auth_args+=(-H "x-osmo-user: osmo-admin" -H "x-osmo-roles: osmo-admin,osmo-user") + fi + + curl -s -X "$method" "$url" \ + -H "Content-Type: application/json" \ + "${auth_args[@]}" \ + "$@" +} + +# Log in to OSMO using the appropriate method. +# When bypassing Envoy this is a no-op (curl headers handle auth). +# Otherwise uses `osmo login --method dev`. +# Usage: osmo_login [port] +osmo_login() { + local port="${1:-8080}" + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + log_info "Auth bypass active -- using direct API headers (osmo-admin role)" + else + log_info "Logging in to OSMO..." + if ! osmo login "http://localhost:${port}" --method dev --username admin 2>/dev/null; then + log_error "Failed to login to OSMO" + return 1 + fi + log_success "Logged in successfully" + fi +} + +# Update an OSMO config via the PATCH API (partial merge). +# When _OSMO_AUTH_BYPASS=true, uses curl; otherwise uses osmo CLI. 
+# Usage: osmo_config_update +# Example: osmo_config_update WORKFLOW /tmp/config.json "Configure storage" +osmo_config_update() { + local config_type="$1" + local json_file="$2" + local description="${3:-Update config}" + local port="${4:-8080}" + + if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then + local endpoint + endpoint="api/configs/$(echo "$config_type" | tr '[:upper:]' '[:lower:]')" + + # Build PATCH request body: {"description": "...", "configs_dict": } + local body + body=$(jq -n --arg desc "$description" --slurpfile cfg "$json_file" \ + '{description: $desc, configs_dict: $cfg[0]}') + + local http_code + http_code=$(osmo_curl PATCH "http://localhost:${port}/${endpoint}" \ + -d "$body" -o /tmp/_osmo_patch_resp.txt -w "%{http_code}") + + if [[ "$http_code" =~ ^2 ]]; then + return 0 + else + log_error "PATCH /${endpoint} returned HTTP ${http_code}" + cat /tmp/_osmo_patch_resp.txt 2>/dev/null || true + return 1 + fi + else + osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null + fi +} diff --git a/applications/osmo/deploy/002-setup/sample_osmo_realm.json b/applications/osmo/deploy/002-setup/sample_osmo_realm.json old mode 100644 new mode 100755 diff --git a/applications/osmo/deploy/002-setup/values/network-operator.yaml b/applications/osmo/deploy/002-setup/values/network-operator.yaml index eebf2e472..146a9daca 100755 --- a/applications/osmo/deploy/002-setup/values/network-operator.yaml +++ b/applications/osmo/deploy/002-setup/values/network-operator.yaml @@ -3,8 +3,13 @@ # Operator settings operator: + nodeSelector: + node-role.kubernetes.io/control-plane: "" tolerations: - - key: nvidia.com/gpu + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane operator: Exists effect: NoSchedule diff --git a/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh deleted 
file mode 100755 index ac6289b7d..000000000 --- a/applications/osmo/deploy/002a-setup/01-deploy-gpu-infrastructure.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -# -# Deploy GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " GPU Infrastructure Deployment" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 -check_helm || exit 1 - -# Add Helm repos -log_info "Adding Helm repositories..." -helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update -helm repo update - -# Auto-detect driverfull images from Terraform config -if [[ -z "${USE_DRIVERFULL_IMAGES:-}" ]]; then - TF_DRIVERFULL=$(get_tf_output "gpu_nodes_driverfull_image" "../001-iac" || echo "") - if [[ "$TF_DRIVERFULL" == "true" ]]; then - USE_DRIVERFULL_IMAGES="true" - log_info "Auto-detected driverfull images from Terraform" - fi -fi - -# ----------------------------------------------------------------------------- -# Deploy GPU Operator (skipped when using driverfull images) -# ----------------------------------------------------------------------------- -if [[ "${USE_DRIVERFULL_IMAGES:-false}" == "true" ]]; then - log_info "Skipping GPU Operator (using Nebius driverfull images with pre-installed drivers)" - log_info "Installing NVIDIA device plugin for driverfull mode..." - - kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - - # With driverfull images, we still need the GPU Operator for toolkit, device-plugin, - # dcgm, etc. - but driver installation is disabled. 
- helm upgrade --install gpu-operator nvidia/gpu-operator \ - --namespace "${GPU_OPERATOR_NAMESPACE}" \ - --values "${VALUES_DIR}/gpu-operator.yaml" \ - --set driver.enabled=false \ - --timeout 10m - - log_success "GPU Operator deployed (driver disabled - using driverfull images)" -else - log_info "Deploying NVIDIA GPU Operator (with driver installation)..." - - kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - - helm upgrade --install gpu-operator nvidia/gpu-operator \ - --namespace "${GPU_OPERATOR_NAMESPACE}" \ - --values "${VALUES_DIR}/gpu-operator.yaml" \ - --timeout 10m - - log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" -fi - -# Brief wait for core operator pod only (not GPU node components) -sleep 10 -kubectl get pods -n "${GPU_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true - -# ----------------------------------------------------------------------------- -# Deploy Network Operator (for InfiniBand) - OPTIONAL -# ----------------------------------------------------------------------------- -if [[ "${ENABLE_NETWORK_OPERATOR:-false}" == "true" ]]; then - log_info "Deploying NVIDIA Network Operator (InfiniBand support)..." 
- - kubectl create namespace "${NETWORK_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - - helm upgrade --install network-operator nvidia/network-operator \ - --namespace "${NETWORK_OPERATOR_NAMESPACE}" \ - --values "${VALUES_DIR}/network-operator.yaml" \ - --timeout 10m - - log_success "Network Operator deployed" - - # Brief wait and show status - sleep 5 - kubectl get pods -n "${NETWORK_OPERATOR_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true -else - log_info "Skipping Network Operator (set ENABLE_NETWORK_OPERATOR=true to install)" -fi - -# ----------------------------------------------------------------------------- -# Deploy KAI Scheduler (from NVIDIA OCI registry) -# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html -# ----------------------------------------------------------------------------- -log_info "Deploying KAI Scheduler..." - -kubectl create namespace "${KAI_SCHEDULER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - -# Install directly from OCI registry -KAI_VERSION="${KAI_SCHEDULER_VERSION:-0.4.0}" -helm upgrade --install kai-scheduler \ - oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler \ - --version "${KAI_VERSION}" \ - --namespace "${KAI_SCHEDULER_NAMESPACE}" \ - --values "${VALUES_DIR}/kai-scheduler.yaml" \ - --timeout 5m - -log_success "KAI Scheduler deployed" - -# Brief wait and show status -sleep 5 -kubectl get pods -n "${KAI_SCHEDULER_NAMESPACE}" --no-headers 2>/dev/null | head -5 || true - -# ----------------------------------------------------------------------------- -# Verify Installation -# ----------------------------------------------------------------------------- -echo "" -log_info "Verifying GPU infrastructure..." 
- -# Check GPU nodes -GPU_NODES=$(kubectl get nodes -l node-type=gpu -o name 2>/dev/null | wc -l) -if [[ $GPU_NODES -gt 0 ]]; then - log_success "Found $GPU_NODES GPU node(s)" - kubectl get nodes -l node-type=gpu -o wide -else - log_warning "No GPU nodes found yet (they may still be provisioning)" -fi - -echo "" -echo "========================================" -log_success "GPU Infrastructure deployment complete!" -echo "========================================" -echo "" -echo "Next step: ./02-deploy-observability.sh" -echo "" diff --git a/applications/osmo/deploy/002a-setup/02-deploy-observability.sh b/applications/osmo/deploy/002a-setup/02-deploy-observability.sh deleted file mode 100755 index cee09bac5..000000000 --- a/applications/osmo/deploy/002a-setup/02-deploy-observability.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -# -# Deploy Observability Stack (Prometheus, Grafana, Loki) -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -echo "" -echo "========================================" -echo " Observability Stack Deployment" -echo "========================================" -echo "" - -# Check prerequisites -check_kubectl || exit 1 -check_helm || exit 1 - -# Add Helm repos -log_info "Adding Helm repositories..." 
-helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update -helm repo add grafana https://grafana.github.io/helm-charts --force-update -helm repo update - -# Create namespace -kubectl create namespace "${MONITORING_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - -# Generate Grafana password if not set -if [[ -z "$GRAFANA_ADMIN_PASSWORD" ]]; then - GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 16) - log_info "Generated Grafana admin password" -fi - -# ----------------------------------------------------------------------------- -# Deploy Prometheus -# ----------------------------------------------------------------------------- -log_info "Deploying Prometheus..." - -helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \ - --namespace "${MONITORING_NAMESPACE}" \ - --values "${VALUES_DIR}/prometheus.yaml" \ - --set grafana.adminPassword="${GRAFANA_ADMIN_PASSWORD}" \ - --wait --timeout 10m - -log_success "Prometheus stack deployed" - -# ----------------------------------------------------------------------------- -# Deploy Loki -# ----------------------------------------------------------------------------- -log_info "Deploying Loki..." - -helm upgrade --install loki grafana/loki-stack \ - --namespace "${MONITORING_NAMESPACE}" \ - --values "${VALUES_DIR}/loki.yaml" \ - --wait --timeout 10m - -log_success "Loki deployed" - -# ----------------------------------------------------------------------------- -# Deploy Promtail -# ----------------------------------------------------------------------------- -log_info "Deploying Promtail..." 
- -helm upgrade --install promtail grafana/promtail \ - --namespace "${MONITORING_NAMESPACE}" \ - --values "${VALUES_DIR}/promtail.yaml" \ - --wait --timeout 5m - -log_success "Promtail deployed" - -# ----------------------------------------------------------------------------- -# Configure Grafana Datasources -# ----------------------------------------------------------------------------- -log_info "Configuring Grafana datasources..." - -# Loki datasource is auto-configured via values - -# Wait for Grafana -wait_for_pods "${MONITORING_NAMESPACE}" "app.kubernetes.io/name=grafana" 180 - -# ----------------------------------------------------------------------------- -# Output Access Information -# ----------------------------------------------------------------------------- -echo "" -echo "========================================" -log_success "Observability stack deployment complete!" -echo "========================================" -echo "" -echo "Access Grafana:" -echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-grafana 3000:80" -echo " URL: http://localhost:3000" -echo " Username: admin" -echo " Password: ${GRAFANA_ADMIN_PASSWORD}" -echo "" -echo "Access Prometheus:" -echo " kubectl port-forward -n ${MONITORING_NAMESPACE} svc/prometheus-kube-prometheus-prometheus 9090:9090" -echo " URL: http://localhost:9090" -echo "" -echo "Next step: ./03-deploy-nginx-ingress.sh" -echo "" diff --git a/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh deleted file mode 100755 index 5ecda68d3..000000000 --- a/applications/osmo/deploy/002a-setup/03-deploy-nginx-ingress.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# -# Deploy NGINX Ingress Controller (community) -# Provides path-based routing for all OSMO services (API, router, Web UI). 
-# -# This installs the same controller OSMO uses elsewhere: -# - OSMO quick-start chart (Chart.yaml) depends on ingress-nginx from the same Helm repo. -# - OSMO Kind runner (run/start_service_kind.py) installs ingress-nginx the same way. -# We do not use the quick-start umbrella chart here (Nebius uses managed DB, etc.), -# so we install the controller explicitly. Not a duplicate of OSMO—same upstream chart. -# -# Run before 05-deploy-osmo-control-plane.sh. -# See: https://kubernetes.github.io/ingress-nginx/deploy/ - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" -INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" - -echo "" -echo "========================================" -echo " NGINX Ingress Controller Deployment" -echo "========================================" -echo "" - -check_kubectl || exit 1 -check_helm || exit 1 - -# ----------------------------------------------------------------------------- -# Add Helm repo -# ----------------------------------------------------------------------------- -log_info "Adding ingress-nginx Helm repository..." -helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx --force-update -helm repo update - -# ----------------------------------------------------------------------------- -# Create namespace and install -# ----------------------------------------------------------------------------- -log_info "Creating namespace ${INGRESS_NAMESPACE}..." -kubectl create namespace "${INGRESS_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - - -log_info "Installing NGINX Ingress Controller..." -# --set controller.progressDeadlineSeconds=600: chart v4.14+ defaults to 0 which -# K8s 1.32+ rejects ("must be greater than minReadySeconds"). 
Without this fix the -# Deployment is invalid, the controller never starts, and the admission webhook -# blocks all Ingress resource creation in downstream scripts. -helm upgrade --install "${INGRESS_RELEASE_NAME}" ingress-nginx/ingress-nginx \ - --namespace "${INGRESS_NAMESPACE}" \ - --set controller.service.type=LoadBalancer \ - --set controller.progressDeadlineSeconds=600 \ - --wait --timeout 5m || { - log_warning "Helm install returned non-zero; controller may still be starting." -} - -log_success "NGINX Ingress Controller deployed" - -# ----------------------------------------------------------------------------- -# Wait for LoadBalancer IP (optional; may take 1–2 min on cloud) -# ----------------------------------------------------------------------------- -log_info "Waiting for LoadBalancer IP (up to 120s)..." -for i in $(seq 1 24); do - LB_IP=$(kubectl get svc -n "${INGRESS_NAMESPACE}" -l app.kubernetes.io/name=ingress-nginx -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - if [[ -n "$LB_IP" ]]; then - log_success "LoadBalancer IP: ${LB_IP}" - echo "" - echo "OSMO will be accessible at:" - echo " http://${LB_IP}" - echo "" - echo "This URL is auto-detected by 05-deploy-osmo-control-plane.sh." - echo "" - break - fi - sleep 5 -done -if [[ -z "${LB_IP:-}" ]]; then - log_warning "LoadBalancer IP not yet assigned. 
Check: kubectl get svc -n ${INGRESS_NAMESPACE}" -fi - -echo "========================================" -log_success "NGINX Ingress deployment complete" -echo "========================================" -echo "" -echo "Next: run 04-enable-tls.sh (optional, recommended)" -echo " then 05-deploy-osmo-control-plane.sh" -echo "" diff --git a/applications/osmo/deploy/002a-setup/04-enable-tls.sh b/applications/osmo/deploy/002a-setup/04-enable-tls.sh deleted file mode 100755 index cbc6cb7de..000000000 --- a/applications/osmo/deploy/002a-setup/04-enable-tls.sh +++ /dev/null @@ -1,441 +0,0 @@ -#!/bin/bash -# -# Enable TLS/HTTPS using cert-manager + Let's Encrypt -# -# Can be run at two points in the deployment flow: -# -# A) Right after 03-deploy-nginx-ingress.sh (RECOMMENDED): -# Installs cert-manager, issues the TLS certificate early. -# When 05-deploy-osmo-control-plane.sh runs later, it auto-detects the -# certificate and creates TLS-enabled Ingress resources from the start. -# -# B) After 05-deploy-osmo-control-plane.sh (retrofit existing deployment): -# Does everything in (A) plus patches existing OSMO Ingress resources -# and updates service_base_url to HTTPS. -# -# Prerequisites: -# 1. NGINX Ingress Controller deployed (03-deploy-nginx-ingress.sh) -# 2. 
A DNS A record pointing your domain to the LoadBalancer IP -# -# Usage: -# ./04-enable-tls.sh -# -# Example: -# ./04-enable-tls.sh vl51.eu-north1.osmo.nebius.cloud -# -# Optional environment variables: -# OSMO_TLS_EMAIL - Email for Let's Encrypt expiry notices (default: noreply@) -# OSMO_TLS_SECRET_NAME - K8s Secret name for certificate (default: osmo-tls) -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -HOSTNAME="${1:-${OSMO_INGRESS_HOSTNAME:-}}" -HOSTNAME="${HOSTNAME%.}" # Strip trailing dot (FQDN notation) -TLS_SECRET="${OSMO_TLS_SECRET_NAME:-osmo-tls}" -OSMO_NS="${OSMO_NAMESPACE:-osmo}" -INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" - -echo "" -echo "========================================" -echo " Enable TLS/HTTPS" -echo "========================================" -echo "" - -# ----------------------------------------------------------------------------- -# Validate inputs -# ----------------------------------------------------------------------------- -if [[ -z "$HOSTNAME" ]]; then - log_error "Usage: $0 " - echo "" - echo "Example: $0 vl51.eu-north1.osmo.nebius.cloud" - echo "" - LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ - -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - if [[ -n "$LB_IP" ]]; then - echo "Your LoadBalancer IP is: ${LB_IP}" - echo "Create a DNS A record pointing your domain to this IP, then re-run this script." 
- fi - exit 1 -fi - -check_kubectl || exit 1 -check_helm || exit 1 - -log_info "Hostname: ${HOSTNAME}" -log_info "TLS secret: ${TLS_SECRET}" - -# Keycloak auth subdomain support -DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" -KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" -AUTH_HOSTNAME="" -if [[ "$DEPLOY_KEYCLOAK" == "true" ]]; then - if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then - AUTH_HOSTNAME="${KEYCLOAK_HOSTNAME}" - else - AUTH_HOSTNAME="auth.${HOSTNAME}" - fi - log_info "Keycloak auth hostname: ${AUTH_HOSTNAME}" - log_info "Keycloak TLS secret: ${KC_TLS_SECRET}" -fi - -# Get LoadBalancer IP -LB_IP=$(kubectl get svc -n "${INGRESS_NS}" ingress-nginx-controller \ - -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - -# Prompt user to set up DNS records before proceeding -echo "" -echo "========================================" -echo " DNS Record Setup Required" -echo "========================================" -echo "" -if [[ -n "$LB_IP" ]]; then - echo "Create the following DNS A record(s) pointing to your LoadBalancer IP:" - echo "" - echo " ${HOSTNAME} -> ${LB_IP}" - if [[ -n "$AUTH_HOSTNAME" ]]; then - echo " ${AUTH_HOSTNAME} -> ${LB_IP}" - fi -else - echo "LoadBalancer IP not yet assigned. Check with:" - echo " kubectl get svc -n ${INGRESS_NS} ingress-nginx-controller" - echo "" - echo "Once the IP is available, create DNS A record(s) for:" - echo " ${HOSTNAME}" - if [[ -n "$AUTH_HOSTNAME" ]]; then - echo " ${AUTH_HOSTNAME}" - fi -fi -echo "" -echo "Let's Encrypt HTTP-01 challenges require DNS to resolve to the LoadBalancer." 
-echo "" -read_prompt_var "Press Enter once DNS records are configured (or type 'skip' to skip DNS check)" DNS_CONFIRM "" - -# Verify DNS resolves to the LoadBalancer IP -if [[ "$DNS_CONFIRM" != "skip" ]]; then - DNS_IP=$(dig +short "$HOSTNAME" 2>/dev/null | tail -1 || true) - - if [[ -n "$LB_IP" && -n "$DNS_IP" ]]; then - if [[ "$DNS_IP" == "$LB_IP" ]]; then - log_success "DNS check: ${HOSTNAME} -> ${DNS_IP} (matches LoadBalancer)" - else - log_warning "DNS mismatch: ${HOSTNAME} -> ${DNS_IP}, but LoadBalancer IP is ${LB_IP}" - log_warning "Let's Encrypt HTTP-01 challenge may fail if DNS doesn't point to the LoadBalancer." - fi - elif [[ -z "$DNS_IP" ]]; then - log_warning "Could not resolve ${HOSTNAME}. Make sure the DNS record exists." - fi - - if [[ -n "$AUTH_HOSTNAME" ]]; then - AUTH_DNS_IP=$(dig +short "$AUTH_HOSTNAME" 2>/dev/null | tail -1 || true) - if [[ -n "$LB_IP" && -n "$AUTH_DNS_IP" ]]; then - if [[ "$AUTH_DNS_IP" == "$LB_IP" ]]; then - log_success "DNS check: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP} (matches LoadBalancer)" - else - log_warning "DNS mismatch: ${AUTH_HOSTNAME} -> ${AUTH_DNS_IP}, but LoadBalancer IP is ${LB_IP}" - fi - elif [[ -z "$AUTH_DNS_IP" ]]; then - log_warning "Could not resolve ${AUTH_HOSTNAME}. Keycloak TLS cert may fail." 
- fi - fi -fi - -# Check if OSMO is already deployed (determines whether to patch Ingress / update config) -INGRESS_COUNT=$(kubectl get ingress -n "${OSMO_NS}" --no-headers 2>/dev/null | wc -l | tr -d ' ') -if [[ "$INGRESS_COUNT" -gt 0 ]]; then - log_info "Found ${INGRESS_COUNT} Ingress resource(s) in ${OSMO_NS} (will patch with TLS)" - OSMO_DEPLOYED="true" -else - log_info "No OSMO Ingress resources yet — preparing cert-manager and certificate" - log_info "Step 05 will auto-detect the TLS cert and create HTTPS Ingress" - OSMO_DEPLOYED="false" -fi - -# ----------------------------------------------------------------------------- -# Step 1: Install cert-manager -# ----------------------------------------------------------------------------- -log_info "Installing cert-manager..." -helm repo add jetstack https://charts.jetstack.io --force-update -helm repo update jetstack - -if helm status cert-manager -n cert-manager &>/dev/null; then - log_info "cert-manager already installed" -else - helm install cert-manager jetstack/cert-manager \ - --namespace cert-manager --create-namespace \ - --set crds.enabled=true \ - --wait --timeout 5m -fi -log_success "cert-manager ready" - -# ----------------------------------------------------------------------------- -# Step 2: Create Let's Encrypt ClusterIssuer -# ----------------------------------------------------------------------------- -TLS_EMAIL="${OSMO_TLS_EMAIL:-noreply@${HOSTNAME#*.}}" -log_info "Creating Let's Encrypt ClusterIssuer (email: ${TLS_EMAIL})..." - -kubectl apply -f - </dev/null); do - ing_name="${ing#*/}" - CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') - - kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || echo "") - if [[ "$CERT_READY" == "True" ]]; then - log_success "TLS certificate issued and ready" - break - fi - sleep 5 -done - -if [[ "$CERT_READY" != "True" ]]; then - log_warning "Certificate not ready yet. Checking status..." 
- kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 - echo "" - log_info "It may take a few more minutes. Check with:" - echo " kubectl get certificate -n ${OSMO_NS}" - echo " kubectl describe challenge -n ${OSMO_NS}" -fi - -# ----------------------------------------------------------------------------- -# Step 4b: Issue TLS certificate for Keycloak auth subdomain (if DEPLOY_KEYCLOAK=true) -# ----------------------------------------------------------------------------- -if [[ -n "$AUTH_HOSTNAME" ]]; then - log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." - - # Create bootstrap Ingress for auth subdomain (to trigger HTTP-01 challenge) - kubectl apply -f - </dev/null || echo "") - if [[ "$AUTH_CERT_READY" == "True" ]]; then - log_success "Auth TLS certificate issued and ready" - break - fi - sleep 5 - done - - if [[ "$AUTH_CERT_READY" != "True" ]]; then - log_warning "Auth certificate not ready yet. It may take a few more minutes." - log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" - fi - - # Clean up the bootstrap Ingress once the certificate is issued. - # If left in place, the NGINX admission webhook will reject any Helm chart - # (e.g. Keycloak) that tries to create an ingress for the same host+path. - log_info "Removing auth bootstrap ingress (certificate provisioned)..." - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null -fi - -# ----------------------------------------------------------------------------- -# Step 5: Update OSMO service_base_url to HTTPS (only if OSMO is deployed) -# ----------------------------------------------------------------------------- -if [[ "$OSMO_DEPLOYED" == "true" ]]; then - log_info "Updating OSMO service_base_url to https://${HOSTNAME}..." - - kubectl port-forward -n "${OSMO_NS}" svc/osmo-service 8080:80 &>/dev/null & - _PF_PID=$! 
- trap 'kill $_PF_PID 2>/dev/null; wait $_PF_PID 2>/dev/null' EXIT - - # Wait for port-forward - _pf_ready=false - for i in $(seq 1 15); do - if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/version" 2>/dev/null | grep -q "200\|401\|403"; then - _pf_ready=true - break - fi - sleep 1 - done - - if [[ "$_pf_ready" == "true" ]]; then - if osmo login http://localhost:8080 --method dev --username admin 2>/dev/null; then - cat > /tmp/service_url_tls.json </dev/null; then - NEW_URL=$(curl -s "http://localhost:8080/api/configs/service" 2>/dev/null | jq -r '.service_base_url // ""') - log_success "service_base_url updated to: ${NEW_URL}" - else - log_warning "Could not update service_base_url automatically." - log_info "Run: ./08-configure-service-url.sh https://${HOSTNAME}" - fi - rm -f /tmp/service_url_tls.json - else - log_warning "Could not login to OSMO API. Update service_base_url manually:" - log_info " ./08-configure-service-url.sh https://${HOSTNAME}" - fi - else - log_warning "Could not connect to OSMO API. Update service_base_url manually:" - log_info " ./08-configure-service-url.sh https://${HOSTNAME}" - fi -else - log_info "Skipping service_base_url update (OSMO not deployed yet)" - log_info "Step 05 will auto-detect TLS and use https:// for service_base_url" -fi - -# ----------------------------------------------------------------------------- -# Step 6: Clean up bootstrap Ingress (certificate already provisioned) -# ----------------------------------------------------------------------------- -# Always remove the bootstrap ingress once certs are issued. If left in place, -# the NGINX admission webhook will reject any Helm chart (e.g. osmo-ui) that -# tries to create an ingress for the same host+path. -log_info "Removing main bootstrap ingress (certificate provisioned)..." 
-kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null - -# ----------------------------------------------------------------------------- -# Done -# ----------------------------------------------------------------------------- -echo "" -echo "========================================" -log_success "TLS setup complete" -echo "========================================" -echo "" - -if [[ "$OSMO_DEPLOYED" == "true" ]]; then - echo "OSMO is now accessible at:" - echo " https://${HOSTNAME}" - echo " https://${HOSTNAME}/api/version" - echo "" - echo "CLI login:" - echo " osmo login https://${HOSTNAME} --method dev --username admin" -else - echo "TLS certificate prepared for: ${HOSTNAME}" - if [[ -n "$AUTH_HOSTNAME" ]]; then - echo "Auth TLS certificate prepared for: ${AUTH_HOSTNAME}" - fi - echo "" - echo "Next steps:" - echo " 1. Wait for certificate(s) to be ready: kubectl get certificate -n ${OSMO_NS}" - echo " 2. Deploy OSMO: ./05-deploy-osmo-control-plane.sh" - echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" - if [[ -n "$AUTH_HOSTNAME" ]]; then - echo " 3. Deploy with Keycloak: DEPLOY_KEYCLOAK=true ./05-deploy-osmo-control-plane.sh" - echo " (Keycloak will be exposed at https://${AUTH_HOSTNAME})" - fi -fi -echo "" diff --git a/applications/osmo/deploy/002a-setup/README.md b/applications/osmo/deploy/002a-setup/README.md deleted file mode 100755 index 05ec8b55c..000000000 --- a/applications/osmo/deploy/002a-setup/README.md +++ /dev/null @@ -1,363 +0,0 @@ -# Kubernetes Setup Scripts - -This directory contains scripts for configuring the Kubernetes cluster with GPU infrastructure and OSMO components. - -## Prerequisites - -1. Complete infrastructure deployment (001-iac) -2. kubectl configured with cluster access: - ```bash - nebius mk8s cluster get-credentials --id --external - ``` - -## Deployment Order - -Run scripts in order: - -```bash -# 1. 
GPU Infrastructure (GPU Operator, Network Operator, KAI Scheduler) -./01-deploy-gpu-infrastructure.sh - -# 2. Observability (Prometheus, Grafana, Loki) -./02-deploy-observability.sh - -# 3. NGINX Ingress Controller (required – provides routing for OSMO services) -./03-deploy-nginx-ingress.sh - -# 4. OSMO Control Plane -./04-deploy-osmo-control-plane.sh - -# 5. OSMO Backend -./05-deploy-osmo-backend.sh - -# 6. Configure Storage (requires port-forward, see main README) -./06-configure-storage.sh - -# 7. Configure GPU Platform (required for GPU workflows) -./08-configure-gpu-platform.sh -``` - -## Scripts - -| Script | Purpose | Duration | -|--------|---------|----------| -| `01-deploy-gpu-infrastructure.sh` | GPU Operator, Network Operator, KAI Scheduler | ~15 min | -| `02-deploy-observability.sh` | Prometheus, Grafana, Loki, Promtail | ~10 min | -| `03-deploy-nginx-ingress.sh` | NGINX Ingress Controller (routing for OSMO services) | ~2 min | -| `04-deploy-osmo-control-plane.sh` | OSMO Control Plane, Ingress resources, database secrets, service URL | ~5 min | -| `05-deploy-osmo-backend.sh` | OSMO Backend operator | ~5 min | -| `06-configure-storage.sh` | Configure S3-compatible storage for workflow logs/data | ~1 min | -| `07-configure-service-url.sh` | Reconfigure service URL manually (usually not needed) | ~1 min | -| `08-configure-gpu-platform.sh` | Configure GPU platform with tolerations/node selector | ~1 min | - -## Configuration - -### Helm Values - -Customize deployments by editing files in `values/`: - -| File | Component | -|------|-----------| -| `gpu-operator.yaml` | NVIDIA GPU Operator | -| `network-operator.yaml` | NVIDIA Network Operator | -| `kai-scheduler.yaml` | KAI GPU Scheduler | -| `prometheus.yaml` | Prometheus + Grafana | -| `loki.yaml` | Loki Log Aggregation | -| `promtail.yaml` | Log Collection | - -### Environment Variables - -Configure via `defaults.sh` or export before running: - -```bash -# Namespaces 
-GPU_OPERATOR_NAMESPACE="gpu-operator" -NETWORK_OPERATOR_NAMESPACE="network-operator" -MONITORING_NAMESPACE="monitoring" -OSMO_NAMESPACE="osmo" - -# Grafana password (auto-generated if empty) -GRAFANA_ADMIN_PASSWORD="" - -# NGINX Ingress (deploy 03-deploy-nginx-ingress.sh before 04-deploy-osmo-control-plane.sh) -OSMO_INGRESS_HOSTNAME="" # hostname for Ingress rules (e.g. osmo.example.com); leave empty for IP-based access -OSMO_INGRESS_BASE_URL="" # override for service_base_url; auto-detected from LoadBalancer if empty -``` - -### Secrets from MysteryBox - -If you ran `secrets-init.sh` in the prerequisites step, the following environment variables are set: - -| Variable | Description | -|----------|-------------| -| `TF_VAR_postgresql_mysterybox_secret_id` | MysteryBox secret ID for PostgreSQL password | -| `TF_VAR_mek_mysterybox_secret_id` | MysteryBox secret ID for MEK (Master Encryption Key) | - -The `04-deploy-osmo-control-plane.sh` script automatically reads these secrets from MysteryBox. This keeps sensitive credentials out of Terraform state and provides a secure secrets management workflow. - -**Secret retrieval order:** -1. **MysteryBox** (if secret ID is set via `TF_VAR_*` or `OSMO_*` env vars) -2. **Terraform outputs** (fallback) -3. **Environment variables** (fallback) -4. 
**Interactive prompt** (last resort) - -To manually retrieve secrets from MysteryBox: -```bash -# PostgreSQL password -nebius mysterybox v1 payload get-by-key \ - --secret-id $TF_VAR_postgresql_mysterybox_secret_id \ - --key password --format json | jq -r '.data.string_value' - -# MEK (Master Encryption Key) -nebius mysterybox v1 payload get-by-key \ - --secret-id $TF_VAR_mek_mysterybox_secret_id \ - --key mek --format json | jq -r '.data.string_value' -``` - -## Accessing Services - -### Grafana Dashboard - -```bash -kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 -# Open http://localhost:3000 -# User: admin -# Password: (shown during deployment or in defaults.sh) -``` - -### Prometheus - -```bash -kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 -# Open http://localhost:9090 -``` - -### OSMO API - -```bash -kubectl port-forward -n osmo svc/osmo-service 8080:80 -# Open http://localhost:8080 -``` - -### OSMO Web UI - -```bash -kubectl port-forward -n osmo svc/osmo-ui 8081:80 -# Open http://localhost:8081 -``` - -## Cleanup - -Run cleanup scripts in reverse order: - -```bash -cd cleanup - -# Remove OSMO -./uninstall-osmo-backend.sh -./uninstall-osmo-control-plane.sh - -# Remove observability -./uninstall-observability.sh - -# Remove GPU infrastructure -./uninstall-gpu-infrastructure.sh -``` - -## Configure OSMO GPU Platform - -After deploying OSMO backend, configure the GPU platform so OSMO can schedule workloads on GPU nodes. - -### Why is this needed? - -Nebius GPU nodes have a taint `nvidia.com/gpu=true:NoSchedule` that prevents pods from being scheduled unless they have matching tolerations. OSMO needs to be configured with: - -1. A **pod template** with GPU tolerations and node selector -2. 
A **GPU platform** that references this pod template - -### Option 1: Run the Configuration Script (Recommended) - -```bash -./08-configure-gpu-platform.sh -``` - -### Option 2: Manual Configuration via API - -With port-forward running (`kubectl port-forward -n osmo svc/osmo-service 8080:80`): - -**Step 1: Create GPU Pod Template** - -```bash -curl -X PUT 'http://localhost:8080/api/configs/pod_template/gpu_tolerations' \ - -H 'Content-Type: application/json' \ - -d @gpu_pod_template.json -``` - -Where `gpu_pod_template.json` contains: - -```json -{ - "configs": { - "spec": { - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "nodeSelector": { - "nvidia.com/gpu.present": "true" - } - } - } -} -``` - -**Step 2: Create GPU Platform** - -```bash -curl -X PUT 'http://localhost:8080/api/configs/pool/default/platform/gpu' \ - -H 'Content-Type: application/json' \ - -d @gpu_platform_update.json -``` - -Where `gpu_platform_update.json` contains: - -```json -{ - "configs": { - "description": "GPU platform for L40S nodes", - "host_network_allowed": false, - "privileged_allowed": false, - "allowed_mounts": [], - "default_mounts": [], - "default_variables": { - "USER_GPU": 1 - }, - "resource_validations": [], - "override_pod_template": ["gpu_tolerations"] - } -} -``` - -### Verify Configuration - -```bash -# Check pod templates -curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' -# Should include: "gpu_tolerations" - -# Check GPU platform -curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms.gpu' - -# Check resources (GPU nodes should now be visible) -curl -s http://localhost:8080/api/resources | jq '.resources[] | {name: .name, gpu: .allocatable_fields.gpu}' -``` - -### Using GPU in Workflows - -Specify `platform: gpu` in your OSMO workflow: - -```yaml -workflow: - name: my-gpu-job - resources: - gpu-resource: - platform: gpu # <-- Selects GPU platform with tolerations - gpu: 1 - 
memory: 4Gi - tasks: - - name: train - image: nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 - command: ["nvidia-smi"] - resource: gpu-resource -``` - -## Troubleshooting - -### GPU Nodes Not Ready - -1. Check GPU operator pods: - ```bash - kubectl get pods -n gpu-operator - ``` - -2. Check node labels: - ```bash - kubectl get nodes -l node-type=gpu --show-labels - ``` - -3. Check DCGM exporter: - ```bash - kubectl logs -n gpu-operator -l app=nvidia-dcgm-exporter - ``` - -### Pods Pending on GPU Nodes - -1. Verify tolerations: - ```bash - kubectl describe pod | grep -A5 Tolerations - ``` - -2. Check node taints: - ```bash - kubectl describe node | grep Taints - ``` - -### InfiniBand Issues - -1. Check Network Operator: - ```bash - kubectl get pods -n network-operator - ``` - -2. Verify RDMA devices: - ```bash - kubectl exec -n gpu-operator -- ibstat - ``` - -### Database Connection Failed - -1. Verify PostgreSQL is accessible: - ```bash - kubectl get secret osmo-database -n osmo -o yaml - ``` - -2. Test connection from a pod: - ```bash - kubectl run pg-test --rm -it --image=postgres:16 -- psql -h -U -d - ``` - -### OSMO Not Seeing GPU Resources - -If OSMO shows 0 GPUs or GPU workflows fail to schedule: - -1. Check if GPU platform is configured: - ```bash - curl -s http://localhost:8080/api/configs/pool/default | jq '.platforms | keys' - # Should include "gpu" - ``` - -2. Check if GPU pod template exists: - ```bash - curl -s http://localhost:8080/api/configs/pod_template | jq 'keys' - # Should include "gpu_tolerations" - ``` - -3. Check GPU node labels and taints: - ```bash - kubectl describe node | grep -E 'Taints:|nvidia.com/gpu' - # Should show taint: nvidia.com/gpu=true:NoSchedule - # Should show label: nvidia.com/gpu.present=true - ``` - -4. If missing, run the GPU configuration: - ```bash - ./08-configure-gpu-platform.sh - ``` - -5. 
Verify OSMO sees GPU resources: - ```bash - curl -s http://localhost:8080/api/resources | jq '.resources[] | select(.allocatable_fields.gpu != null)' - ``` diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh deleted file mode 100755 index de869a0cf..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-gpu-infrastructure.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# Uninstall GPU Infrastructure -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/../lib/common.sh" -source "${SCRIPT_DIR}/../defaults.sh" - -echo "" -echo "========================================" -echo " Uninstalling GPU Infrastructure" -echo "========================================" -echo "" - -log_warning "This will remove GPU Operator, Network Operator, and KAI Scheduler" -read_prompt_var "Continue? (y/N)" confirm "" -if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "Cancelled" - exit 0 -fi - -log_info "Removing KAI Scheduler..." -helm uninstall kai-scheduler -n "${KAI_SCHEDULER_NAMESPACE}" 2>/dev/null || true -kubectl delete namespace "${KAI_SCHEDULER_NAMESPACE}" --ignore-not-found - -log_info "Removing Network Operator..." -helm uninstall network-operator -n "${NETWORK_OPERATOR_NAMESPACE}" 2>/dev/null || true -kubectl delete namespace "${NETWORK_OPERATOR_NAMESPACE}" --ignore-not-found - -log_info "Removing GPU Operator..." -helm uninstall gpu-operator -n "${GPU_OPERATOR_NAMESPACE}" 2>/dev/null || true - -# Remove GPU Operator CRDs -log_info "Removing GPU Operator CRDs..." 
-kubectl delete crd clusterpolicies.nvidia.com --ignore-not-found -kubectl delete crd nvidiadrivers.nvidia.com --ignore-not-found - -kubectl delete namespace "${GPU_OPERATOR_NAMESPACE}" --ignore-not-found - -log_success "GPU infrastructure uninstalled" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh deleted file mode 100755 index caeaa8b74..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-keycloak.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -# Uninstall Keycloak and disable OSMO authentication -# This removes Keycloak and related secrets. After running this, re-deploy -# OSMO control plane without DEPLOY_KEYCLOAK to switch back to open API mode. -set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -source "${SCRIPT_DIR}/defaults.sh" - -OSMO_NS="${OSMO_NAMESPACE:-osmo}" -KC_TLS_SECRET="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" -INGRESS_NS="${INGRESS_NAMESPACE:-ingress-nginx}" - -echo "" -echo "========================================" -echo " Uninstall Keycloak" -echo "========================================" -echo "" - -check_kubectl || exit 1 -check_helm || exit 1 - -# Step 1: Uninstall Keycloak Helm release -log_info "Uninstalling Keycloak Helm release..." -helm uninstall keycloak --namespace "${OSMO_NS}" 2>/dev/null || log_info "Keycloak Helm release not found (already removed)" - -# Step 2: Delete Keycloak config job and realm ConfigMap -log_info "Cleaning up Keycloak configuration job and ConfigMap..." -kubectl delete job keycloak-osmo-setup -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true -kubectl delete configmap keycloak-realm-json -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true - -# Step 3: Delete Keycloak-related secrets -log_info "Deleting Keycloak secrets..." 
-kubectl delete secret keycloak-admin-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true -kubectl delete secret keycloak-db-secret -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true -kubectl delete secret oidc-secrets -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true -log_success "Keycloak secrets deleted" - -# Step 4: Delete Keycloak TLS secret -log_info "Deleting Keycloak TLS secret (${KC_TLS_SECRET})..." -kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true -kubectl delete secret "${KC_TLS_SECRET}" -n "${INGRESS_NS}" --ignore-not-found 2>/dev/null || true -log_success "Keycloak TLS secrets deleted" - -# Step 5: Delete Keycloak PVCs (if any) -log_info "Cleaning up Keycloak PVCs..." -kubectl delete pvc -l app.kubernetes.io/name=keycloak -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true - -echo "" -log_success "Keycloak uninstalled" -echo "" -echo "Next steps:" -echo " 1. Re-deploy OSMO control plane without authentication:" -echo " unset DEPLOY_KEYCLOAK" -echo " ./05-deploy-osmo-control-plane.sh" -echo "" -echo " 2. (Optional) Drop the Keycloak database from PostgreSQL:" -echo " Connect to your Managed PostgreSQL and run:" -echo " DROP DATABASE IF EXISTS keycloak;" -echo "" -echo " 3. (Optional) Remove the DNS A record for the auth subdomain" -echo "" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh deleted file mode 100755 index 471029d5c..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-nginx-ingress.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Uninstall NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) -set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" -source "${SCRIPT_DIR}/lib/common.sh" -INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" -INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" -log_info "Uninstalling NGINX Ingress Controller..." -helm uninstall "${INGRESS_RELEASE_NAME}" -n "${INGRESS_NAMESPACE}" 2>/dev/null || true -kubectl delete namespace "${INGRESS_NAMESPACE}" --ignore-not-found --timeout=60s 2>/dev/null || true -log_success "NGINX Ingress Controller uninstalled" - -# Uninstall cert-manager (if installed) -if helm status cert-manager -n cert-manager &>/dev/null; then - log_info "Uninstalling cert-manager..." - kubectl delete clusterissuer letsencrypt --ignore-not-found 2>/dev/null || true - helm uninstall cert-manager -n cert-manager 2>/dev/null || true - kubectl delete namespace cert-manager --ignore-not-found --timeout=60s 2>/dev/null || true - log_success "cert-manager uninstalled" -fi diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh deleted file mode 100755 index e847de5a6..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-observability.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# -# Uninstall Observability Stack -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/../lib/common.sh" -source "${SCRIPT_DIR}/../defaults.sh" - -echo "" -echo "========================================" -echo " Uninstalling Observability Stack" -echo "========================================" -echo "" - -log_warning "This will remove Prometheus, Grafana, and Loki" -# Read input with a prompt into a variable (bash/zsh compatible). -read_prompt_var() { - local prompt=$1 - local var_name=$2 - local default=$3 - local value="" - local read_from="/dev/tty" - local write_to="/dev/tty" - - if [[ ! -r "/dev/tty" || ! 
-w "/dev/tty" ]]; then - read_from="/dev/stdin" - write_to="/dev/stdout" - fi - - if [[ -n "$default" ]]; then - printf "%s [%s]: " "$prompt" "$default" >"$write_to" - else - printf "%s: " "$prompt" >"$write_to" - fi - - IFS= read -r value <"$read_from" - if [[ -z "$value" && -n "$default" ]]; then - value="$default" - fi - - eval "$var_name='$value'" -} - -read_prompt_var "Continue? (y/N)" confirm "" -if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "Cancelled" - exit 0 -fi - -log_info "Removing Promtail..." -helm uninstall promtail -n "${MONITORING_NAMESPACE}" 2>/dev/null || true - -log_info "Removing Loki..." -helm uninstall loki -n "${MONITORING_NAMESPACE}" 2>/dev/null || true - -log_info "Removing Prometheus stack..." -helm uninstall prometheus -n "${MONITORING_NAMESPACE}" 2>/dev/null || true - -# Remove CRDs -log_info "Removing Prometheus CRDs..." -kubectl delete crd alertmanagerconfigs.monitoring.coreos.com --ignore-not-found -kubectl delete crd alertmanagers.monitoring.coreos.com --ignore-not-found -kubectl delete crd podmonitors.monitoring.coreos.com --ignore-not-found -kubectl delete crd probes.monitoring.coreos.com --ignore-not-found -kubectl delete crd prometheuses.monitoring.coreos.com --ignore-not-found -kubectl delete crd prometheusrules.monitoring.coreos.com --ignore-not-found -kubectl delete crd servicemonitors.monitoring.coreos.com --ignore-not-found -kubectl delete crd thanosrulers.monitoring.coreos.com --ignore-not-found - -log_info "Removing monitoring namespace..." 
-kubectl delete namespace "${MONITORING_NAMESPACE}" --ignore-not-found - -log_success "Observability stack uninstalled" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh deleted file mode 100755 index cce604c99..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-backend.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# -# Uninstall OSMO Backend Operator -# Reverses everything deployed by 06-deploy-osmo-backend.sh -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/../lib/common.sh" -source "${SCRIPT_DIR}/../defaults.sh" - -OSMO_OPERATOR_NAMESPACE="osmo-operator" -OSMO_WORKFLOWS_NAMESPACE="osmo-workflows" - -echo "" -echo "========================================" -echo " Uninstalling OSMO Backend Operator" -echo "========================================" -echo "" - -log_warning "This will remove:" -echo " - Helm release: osmo-operator (namespace: ${OSMO_OPERATOR_NAMESPACE})" -echo " - Secret: osmo-operator-token (namespace: ${OSMO_OPERATOR_NAMESPACE})" -echo " - Namespace: ${OSMO_OPERATOR_NAMESPACE}" -echo " - Namespace: ${OSMO_WORKFLOWS_NAMESPACE} (and all workflow pods)" -echo "" -read_prompt_var "Continue? (y/N)" confirm "" -if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "Cancelled" - exit 0 -fi - -# Uninstall Helm release -if helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then - log_info "Uninstalling Helm release: osmo-operator..." - helm uninstall osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" --wait --timeout 5m - log_success "Helm release uninstalled" -else - log_info "Helm release osmo-operator not found — skipping" -fi - -# Delete secrets -log_info "Removing secrets..." 
-kubectl delete secret osmo-operator-token -n "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found - -# Delete the internal agent service (created by 05-deploy-osmo-control-plane.sh for backend operator) -log_info "Removing osmo-agent-internal service..." -kubectl delete svc osmo-agent-internal -n "${OSMO_NAMESPACE}" --ignore-not-found - -# Delete namespaces (this also removes any remaining resources inside them) -log_info "Deleting namespace: ${OSMO_WORKFLOWS_NAMESPACE}..." -kubectl delete namespace "${OSMO_WORKFLOWS_NAMESPACE}" --ignore-not-found --wait=false - -log_info "Deleting namespace: ${OSMO_OPERATOR_NAMESPACE}..." -kubectl delete namespace "${OSMO_OPERATOR_NAMESPACE}" --ignore-not-found --wait=false - -echo "" -log_success "OSMO Backend Operator uninstalled" -echo "" -echo "Note: Namespace deletion may continue in the background." -echo " kubectl get ns ${OSMO_OPERATOR_NAMESPACE} ${OSMO_WORKFLOWS_NAMESPACE} 2>/dev/null" -echo "" diff --git a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh deleted file mode 100755 index 0abb5f560..000000000 --- a/applications/osmo/deploy/002a-setup/cleanup/uninstall-osmo-control-plane.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# -# Uninstall OSMO Control Plane -# - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "${SCRIPT_DIR}/../lib/common.sh" -source "${SCRIPT_DIR}/../defaults.sh" - -echo "" -echo "========================================" -echo " Uninstalling OSMO Control Plane" -echo "========================================" -echo "" - -log_warning "This will remove OSMO Control Plane and all OSMO resources" -read_prompt_var "Continue? (y/N)" confirm "" -if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - log_info "Cancelled" - exit 0 -fi - -log_info "Removing OSMO Control Plane..." 
-kubectl delete deployment osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found -kubectl delete service osmo-control-plane -n "${OSMO_NAMESPACE}" --ignore-not-found -kubectl delete secret osmo-database -n "${OSMO_NAMESPACE}" --ignore-not-found -kubectl delete secret osmo-storage -n "${OSMO_NAMESPACE}" --ignore-not-found - -log_info "Removing OSMO namespace..." -kubectl delete namespace "${OSMO_NAMESPACE}" --ignore-not-found - -log_success "OSMO Control Plane uninstalled" diff --git a/applications/osmo/deploy/002a-setup/defaults.sh b/applications/osmo/deploy/002a-setup/defaults.sh deleted file mode 100755 index 94aaa8e1b..000000000 --- a/applications/osmo/deploy/002a-setup/defaults.sh +++ /dev/null @@ -1,72 +0,0 @@ -# ============================================================================= -# Default Configuration for Setup Scripts -# ============================================================================= - -# Namespaces -export GPU_OPERATOR_NAMESPACE="gpu-operator" -export NETWORK_OPERATOR_NAMESPACE="network-operator" -export KAI_SCHEDULER_NAMESPACE="kai-scheduler" -export MONITORING_NAMESPACE="monitoring" -export OSMO_NAMESPACE="osmo" - -# Chart versions (leave empty for latest) -export GPU_OPERATOR_VERSION="" -export NETWORK_OPERATOR_VERSION="" -export KAI_SCHEDULER_VERSION="v0.12.4" # Check https://github.com/NVIDIA/KAI-Scheduler/releases -export PROMETHEUS_VERSION="" -export GRAFANA_VERSION="" -export LOKI_VERSION="" - -# GPU Operator settings -export GPU_DRIVER_ENABLED="false" # Use Nebius driver-full images -export TOOLKIT_ENABLED="true" -export DEVICE_PLUGIN_ENABLED="true" -export MIG_MANAGER_ENABLED="false" - -# Network Operator (only needed for InfiniBand/GPU clusters) -export ENABLE_NETWORK_OPERATOR="false" # Set to "true" if using InfiniBand - -# Observability settings -export PROMETHEUS_RETENTION_DAYS="15" -export LOKI_RETENTION_DAYS="7" -export GRAFANA_ADMIN_PASSWORD="" # Auto-generated if empty - -# NGINX Ingress Controller 
(deployed by 03-deploy-nginx-ingress.sh) -# Namespace where the NGINX Ingress Controller is deployed. -export INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" -# Hostname for Ingress rules (e.g. osmo.example.com). Leave empty to use the LoadBalancer IP directly. -export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" -# Override for the service_base_url used by osmo-ctrl. Auto-detected from the ingress LoadBalancer if empty. -export OSMO_INGRESS_BASE_URL="${OSMO_INGRESS_BASE_URL:-}" - -# TLS / SSL Configuration -# Set OSMO_TLS_ENABLED=true after running 03a (certbot) or 03c (cert-manager). -export OSMO_TLS_ENABLED="${OSMO_TLS_ENABLED:-false}" -# Name of the Kubernetes TLS secret used by Ingress (both paths produce this secret). -# NOTE: The OSMO Helm chart generates ingress TLS with secretName "osmo-tls". -export OSMO_TLS_SECRET_NAME="${OSMO_TLS_SECRET_NAME:-osmo-tls}" -# Local directory where certbot stores certificate files (Path A only). -export OSMO_TLS_CERT_DIR="${OSMO_TLS_CERT_DIR:-$HOME/.osmo-certs}" -# Email for Let's Encrypt registration (required for 03a and 03c). -export LETSENCRYPT_EMAIL="${LETSENCRYPT_EMAIL:-}" -# cert-manager namespace (Path B / 03c only). -export CERT_MANAGER_NAMESPACE="${CERT_MANAGER_NAMESPACE:-cert-manager}" -# Name of the ClusterIssuer created by 03c (Path B only). -export CLUSTER_ISSUER_NAME="${CLUSTER_ISSUER_NAME:-letsencrypt-prod}" -# TLS mode: "certbot" or "cert-manager". Set automatically by 03a/03c. -export OSMO_TLS_MODE="${OSMO_TLS_MODE:-}" - -# Keycloak / Authentication -# Set DEPLOY_KEYCLOAK=true to deploy Keycloak and enable OSMO auth with Envoy sidecars. -export DEPLOY_KEYCLOAK="${DEPLOY_KEYCLOAK:-false}" -# Keycloak hostname (e.g. auth-osmo-nebius.csptst.nvidia.com). -# Auto-derived from OSMO_INGRESS_HOSTNAME if empty: auth-. -export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}" -# TLS secret name for the Keycloak ingress (separate from the main osmo-tls). 
-# Run 03a with OSMO_TLS_SECRET_NAME=osmo-tls-auth for the auth subdomain. -export KEYCLOAK_TLS_SECRET_NAME="${KEYCLOAK_TLS_SECRET_NAME:-osmo-tls-auth}" - -# Paths -export SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -export VALUES_DIR="${SCRIPT_DIR}/values" -export LIB_DIR="${SCRIPT_DIR}/lib" diff --git a/applications/osmo/deploy/002a-setup/gpu_platform_update.json b/applications/osmo/deploy/002a-setup/gpu_platform_update.json deleted file mode 100755 index 56c0764fe..000000000 --- a/applications/osmo/deploy/002a-setup/gpu_platform_update.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "configs": { - "description": "GPU platform for L40S nodes", - "host_network_allowed": false, - "privileged_allowed": false, - "allowed_mounts": [], - "default_mounts": [], - "default_variables": { - "USER_GPU": 1 - }, - "resource_validations": [], - "override_pod_template": ["gpu_tolerations"] - } -} diff --git a/applications/osmo/deploy/002a-setup/gpu_pod_template.json b/applications/osmo/deploy/002a-setup/gpu_pod_template.json deleted file mode 100755 index ae651e3ba..000000000 --- a/applications/osmo/deploy/002a-setup/gpu_pod_template.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "configs": { - "spec": { - "tolerations": [ - { - "key": "nvidia.com/gpu", - "operator": "Exists", - "effect": "NoSchedule" - } - ], - "nodeSelector": { - "nvidia.com/gpu.present": "true" - } - } - } -} diff --git a/applications/osmo/deploy/002a-setup/lib/common.sh b/applications/osmo/deploy/002a-setup/lib/common.sh deleted file mode 100755 index 87abfb573..000000000 --- a/applications/osmo/deploy/002a-setup/lib/common.sh +++ /dev/null @@ -1,434 +0,0 @@ -#!/bin/bash -# -# Common functions for setup scripts -# - -# Colors -export RED='\033[0;31m' -export GREEN='\033[0;32m' -export YELLOW='\033[1;33m' -export BLUE='\033[0;34m' -export NC='\033[0m' - -# Logging functions -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[✓]${NC} $1" -} - -log_warning() { - echo -e 
"${YELLOW}[!]${NC} $1" -} - -log_error() { - echo -e "${RED}[✗]${NC} $1" -} - -# Read input with a prompt into a variable (bash/zsh compatible). -read_prompt_var() { - local prompt=$1 - local var_name=$2 - local default=$3 - local value="" - local read_from="/dev/tty" - local write_to="/dev/tty" - - if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then - read_from="/dev/stdin" - write_to="/dev/stdout" - fi - - if [[ -n "$default" ]]; then - printf "%s [%s]: " "$prompt" "$default" >"$write_to" - else - printf "%s: " "$prompt" >"$write_to" - fi - - IFS= read -r value <"$read_from" - if [[ -z "$value" && -n "$default" ]]; then - value="$default" - fi - - eval "$var_name='$value'" -} - -# Read a secret value into a variable (no echo). -read_secret_var() { - local prompt=$1 - local var_name=$2 - local value="" - local read_from="/dev/tty" - local write_to="/dev/tty" - - if [[ ! -r "/dev/tty" || ! -w "/dev/tty" ]]; then - read_from="/dev/stdin" - write_to="/dev/stdout" - fi - - printf "%s: " "$prompt" >"$write_to" - stty -echo <"$read_from" - IFS= read -r value <"$read_from" - stty echo <"$read_from" - printf "\n" >"$write_to" - - eval "$var_name='$value'" -} - -# Check if command exists -check_command() { - command -v "$1" &>/dev/null -} - -# Retry with exponential backoff -retry_with_backoff() { - local max_attempts=${1:-5} - local delay=${2:-2} - local max_delay=${3:-60} - shift 3 - local cmd=("$@") - - local attempt=1 - while [[ $attempt -le $max_attempts ]]; do - log_info "Attempt $attempt/$max_attempts: ${cmd[*]}" - if "${cmd[@]}"; then - return 0 - fi - - if [[ $attempt -lt $max_attempts ]]; then - log_warning "Failed, retrying in ${delay}s..." 
- sleep "$delay" - delay=$((delay * 2)) - if [[ $delay -gt $max_delay ]]; then - delay=$max_delay - fi - fi - ((attempt++)) - done - - log_error "All $max_attempts attempts failed" - return 1 -} - -# Wait for a condition with timeout -wait_for_condition() { - local description=$1 - local timeout=${2:-300} - local interval=${3:-10} - shift 3 - local cmd=("$@") - - log_info "Waiting for $description (timeout: ${timeout}s)..." - - local elapsed=0 - while [[ $elapsed -lt $timeout ]]; do - if "${cmd[@]}" &>/dev/null; then - log_success "$description" - return 0 - fi - sleep "$interval" - ((elapsed += interval)) - echo -n "." - done - - echo "" - log_error "Timeout waiting for $description" - return 1 -} - -# Check kubectl connection -check_kubectl() { - if ! check_command kubectl; then - log_error "kubectl not found" - return 1 - fi - - if ! kubectl cluster-info &>/dev/null; then - log_error "Cannot connect to Kubernetes cluster" - return 1 - fi - - log_success "kubectl connected to cluster" - return 0 -} - -# Check Helm -check_helm() { - if ! check_command helm; then - log_error "helm not found" - return 1 - fi - - log_success "helm available" - return 0 -} - -# Install Helm chart with retry -helm_install() { - local name=$1 - local chart=$2 - local namespace=$3 - shift 3 - local extra_args=("$@") - - log_info "Installing Helm chart: $name" - - kubectl create namespace "$namespace" --dry-run=client -o yaml | kubectl apply -f - - - retry_with_backoff 3 5 30 helm upgrade --install "$name" "$chart" \ - --namespace "$namespace" \ - --wait --timeout 10m \ - "${extra_args[@]}" -} - -# Wait for pods to be ready -wait_for_pods() { - local namespace=$1 - local label_selector=$2 - local timeout=${3:-300} - - wait_for_condition "pods with label $label_selector in $namespace" \ - "$timeout" 10 \ - kubectl wait --for=condition=Ready pods \ - -n "$namespace" \ - -l "$label_selector" \ - --timeout=10s -} - -# Detect OSMO service URL from the NGINX Ingress Controller's LoadBalancer. 
-# -# When OSMO_TLS_ENABLED=true and OSMO_INGRESS_HOSTNAME is set, returns -# https://. Otherwise falls back to http://. -# -# Lookup order: -# 0. If TLS enabled + hostname set, return https:// immediately -# 1. LoadBalancer external IP (cloud assigns a public/internal IP) -# 2. LoadBalancer hostname (some clouds return a DNS name instead) -# 3. Controller ClusterIP (fallback – works from inside the cluster) -# -# Usage: -# url=$(detect_service_url) -# [[ -n "$url" ]] && echo "OSMO reachable at $url" -detect_service_url() { - local ns="${INGRESS_NAMESPACE:-ingress-nginx}" - local tls_enabled="${OSMO_TLS_ENABLED:-false}" - local hostname="${OSMO_INGRESS_HOSTNAME:-}" - local scheme="http" - - if [[ "$tls_enabled" == "true" ]]; then - scheme="https" - # If hostname is configured, prefer it (TLS certs are issued for the domain) - if [[ -n "$hostname" ]]; then - echo "${scheme}://${hostname}" - return 0 - fi - fi - - # Find the controller service (works for the community ingress-nginx chart) - local lb_ip lb_host cluster_ip svc_name - svc_name=$(kubectl get svc -n "$ns" \ - -l app.kubernetes.io/name=ingress-nginx,app.kubernetes.io/component=controller \ - -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) - - if [[ -n "$svc_name" ]]; then - # 1. LoadBalancer IP - lb_ip=$(kubectl get svc "$svc_name" -n "$ns" \ - -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - if [[ -n "$lb_ip" ]]; then - echo "${scheme}://${lb_ip}" - return 0 - fi - - # 2. LoadBalancer hostname (e.g. ELB on AWS) - lb_host=$(kubectl get svc "$svc_name" -n "$ns" \ - -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true) - if [[ -n "$lb_host" ]]; then - echo "${scheme}://${lb_host}" - return 0 - fi - - # 3. 
ClusterIP of the controller - cluster_ip=$(kubectl get svc "$svc_name" -n "$ns" \ - -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true) - if [[ -n "$cluster_ip" && "$cluster_ip" != "None" ]]; then - echo "${scheme}://${cluster_ip}" - return 0 - fi - fi - - # Nothing found - return 1 -} - -# Get Terraform output (supports nested values like "postgresql.host") -get_tf_output() { - local name=$1 - local tf_dir=${2:-../001-iac} - - # Check if name contains a dot (nested value) - if [[ "$name" == *.* ]]; then - local base_name="${name%%.*}" - local key="${name#*.}" - terraform -chdir="$tf_dir" output -json "$base_name" 2>/dev/null | jq -r ".$key // empty" - else - terraform -chdir="$tf_dir" output -json "$name" 2>/dev/null | jq -r '. // empty' - fi -} - -# Get Nebius CLI path -get_nebius_path() { - if command -v nebius &>/dev/null; then - command -v nebius - elif [[ -x "$HOME/.nebius/bin/nebius" ]]; then - echo "$HOME/.nebius/bin/nebius" - fi -} - -# Read secret from Nebius MysteryBox -# Usage: get_mysterybox_secret -# Returns the secret value or empty string if not found -get_mysterybox_secret() { - local secret_id=$1 - local key=$2 - local nebius_path=$(get_nebius_path) - - if [[ -z "$nebius_path" ]]; then - log_warning "Nebius CLI not found, cannot read from MysteryBox" - return 1 - fi - - if [[ -z "$secret_id" ]]; then - return 1 - fi - - local result=$("$nebius_path" mysterybox v1 payload get-by-key \ - --secret-id "$secret_id" \ - --key "$key" \ - --format json 2>/dev/null) - - if [[ -n "$result" ]]; then - echo "$result" | jq -r '.data.string_value // empty' 2>/dev/null - fi -} - -# ----------------------------------------------------------------------------- -# OSMO API helpers (for use when Envoy auth sidecar is present) -# ----------------------------------------------------------------------------- -# Per OSMO documentation, the OSMO service authorises requests by reading -# the x-osmo-user and x-osmo-roles headers. 
Envoy normally sets these from -# the JWT but when we bypass Envoy (port-forward to pod:8000) we must set -# them ourselves. -# -# Reference: https://nvidia.github.io/OSMO/main/deployment_guide/appendix/authentication/authentication_flow.html - -# Detect if a pod has an Envoy sidecar container -# Usage: has_envoy_sidecar -# Returns 0 (true) if envoy container is found, 1 (false) otherwise -has_envoy_sidecar() { - local ns="$1" - local label="$2" - local pod_name - pod_name=$(kubectl get pod -n "$ns" -l "$label" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) - if [[ -z "$pod_name" ]]; then - return 1 - fi - kubectl get pod -n "$ns" "$pod_name" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null | grep -q envoy -} - -# Start a port-forward that bypasses Envoy when the sidecar is present. -# Sets PORT_FORWARD_PID and prints log messages. -# Usage: start_osmo_port_forward [local_port] -start_osmo_port_forward() { - local ns="${1:-osmo}" - local local_port="${2:-8080}" - - if has_envoy_sidecar "$ns" "app=osmo-service"; then - local pod_name - pod_name=$(kubectl get pod -n "$ns" -l app=osmo-service -o jsonpath='{.items[0].metadata.name}') - log_info "Envoy sidecar detected -- port-forwarding to pod/${pod_name}:8000 (bypassing auth)..." - kubectl port-forward -n "$ns" "pod/${pod_name}" "${local_port}:8000" &>/dev/null & - _OSMO_AUTH_BYPASS=true - else - log_info "No Envoy sidecar -- port-forwarding to svc/osmo-service:80..." - kubectl port-forward -n "$ns" svc/osmo-service "${local_port}:80" &>/dev/null & - _OSMO_AUTH_BYPASS=false - fi - PORT_FORWARD_PID=$! - export _OSMO_AUTH_BYPASS -} - -# Make an authenticated curl call to the OSMO API. -# When _OSMO_AUTH_BYPASS=true (Envoy bypassed), injects x-osmo-user and -# x-osmo-roles headers so the OSMO service authorises the request. -# Usage: osmo_curl [curl-args...] 
-# Example: osmo_curl GET "http://localhost:8080/api/configs/service" -# Example: osmo_curl PATCH "http://localhost:8080/api/configs/service" -d '{"configs_dict":{...}}' -osmo_curl() { - local method="$1"; shift - local url="$1"; shift - - local auth_args=() - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - auth_args+=(-H "x-osmo-user: osmo-admin" -H "x-osmo-roles: osmo-admin,osmo-user") - fi - - curl -s -X "$method" "$url" \ - -H "Content-Type: application/json" \ - "${auth_args[@]}" \ - "$@" -} - -# Log in to OSMO using the appropriate method. -# When bypassing Envoy this is a no-op (curl headers handle auth). -# Otherwise uses `osmo login --method dev`. -# Usage: osmo_login [port] -osmo_login() { - local port="${1:-8080}" - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - log_info "Auth bypass active -- using direct API headers (osmo-admin role)" - else - log_info "Logging in to OSMO..." - if ! osmo login "http://localhost:${port}" --method dev --username admin 2>/dev/null; then - log_error "Failed to login to OSMO" - return 1 - fi - log_success "Logged in successfully" - fi -} - -# Update an OSMO config via the PATCH API (partial merge). -# When _OSMO_AUTH_BYPASS=true, uses curl; otherwise uses osmo CLI. 
-# Usage: osmo_config_update -# Example: osmo_config_update WORKFLOW /tmp/config.json "Configure storage" -osmo_config_update() { - local config_type="$1" - local json_file="$2" - local description="${3:-Update config}" - local port="${4:-8080}" - - if [[ "${_OSMO_AUTH_BYPASS:-false}" == "true" ]]; then - local endpoint - endpoint="api/configs/$(echo "$config_type" | tr '[:upper:]' '[:lower:]')" - - # Build PATCH request body: {"description": "...", "configs_dict": } - local body - body=$(jq -n --arg desc "$description" --slurpfile cfg "$json_file" \ - '{description: $desc, configs_dict: $cfg[0]}') - - local http_code - http_code=$(osmo_curl PATCH "http://localhost:${port}/${endpoint}" \ - -d "$body" -o /tmp/_osmo_patch_resp.txt -w "%{http_code}") - - if [[ "$http_code" =~ ^2 ]]; then - return 0 - else - log_error "PATCH /${endpoint} returned HTTP ${http_code}" - cat /tmp/_osmo_patch_resp.txt 2>/dev/null || true - return 1 - fi - else - osmo config update "$config_type" --file "$json_file" --description "$description" 2>/dev/null - fi -} diff --git a/applications/osmo/deploy/002a-setup/osmo-values-noauth.yaml b/applications/osmo/deploy/002a-setup/osmo-values-noauth.yaml deleted file mode 100755 index 53eb46662..000000000 --- a/applications/osmo/deploy/002a-setup/osmo-values-noauth.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# OSMO Service values - Auth Disabled -# For testing without authentication - -global: - osmoImageLocation: nvcr.io/nvidia/osmo - osmoImageTag: latest - imagePullPolicy: IfNotPresent - -services: - postgres: - enabled: false - serviceName: postgresql.osmo.svc.cluster.local - port: 5432 - db: osmo - user: osmo_admin - passwordSecretName: postgres-secret - passwordSecretKey: password - - redis: - enabled: false - serviceName: redis-master.osmo.svc.cluster.local - port: 6379 - tlsEnabled: false - - service: - scaling: - minReplicas: 1 - maxReplicas: 1 - ingress: - enabled: false - auth: - enabled: false - extraEnv: - - name: OSMO_POSTGRES_HOST - value: 
postgresql.osmo.svc.cluster.local - - name: OSMO_POSTGRES_PORT - value: "5432" - - name: OSMO_POSTGRES_USER - value: osmo_admin - - name: OSMO_POSTGRES_DATABASE - value: osmo - - name: OSMO_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: postgres-secret - key: password - extraVolumes: - - name: vault-secrets - secret: - secretName: vault-secrets - extraVolumeMounts: - - name: vault-secrets - mountPath: /home/osmo/vault-agent/secrets - readOnly: true - - worker: - scaling: - minReplicas: 1 - maxReplicas: 1 - extraEnv: - - name: OSMO_POSTGRES_HOST - value: postgresql.osmo.svc.cluster.local - - name: OSMO_POSTGRES_PORT - value: "5432" - - name: OSMO_POSTGRES_USER - value: osmo_admin - - name: OSMO_POSTGRES_DATABASE - value: osmo - - name: OSMO_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: postgres-secret - key: password - extraVolumes: - - name: vault-secrets - secret: - secretName: vault-secrets - extraVolumeMounts: - - name: vault-secrets - mountPath: /home/osmo/vault-agent/secrets - readOnly: true - - logger: - scaling: - minReplicas: 1 - maxReplicas: 1 - extraEnv: - - name: OSMO_POSTGRES_HOST - value: postgresql.osmo.svc.cluster.local - - name: OSMO_POSTGRES_PORT - value: "5432" - - name: OSMO_POSTGRES_USER - value: osmo_admin - - name: OSMO_POSTGRES_DATABASE - value: osmo - - name: OSMO_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: postgres-secret - key: password - extraVolumes: - - name: vault-secrets - secret: - secretName: vault-secrets - extraVolumeMounts: - - name: vault-secrets - mountPath: /home/osmo/vault-agent/secrets - readOnly: true - - agent: - scaling: - minReplicas: 1 - maxReplicas: 1 - extraEnv: - - name: OSMO_POSTGRES_HOST - value: postgresql.osmo.svc.cluster.local - - name: OSMO_POSTGRES_PORT - value: "5432" - - name: OSMO_POSTGRES_USER - value: osmo_admin - - name: OSMO_POSTGRES_DATABASE - value: osmo - - name: OSMO_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: postgres-secret - key: password - extraVolumes: - - 
name: vault-secrets - secret: - secretName: vault-secrets - extraVolumeMounts: - - name: vault-secrets - mountPath: /home/osmo/vault-agent/secrets - readOnly: true - - delayedJobMonitor: - replicas: 1 - extraEnv: - - name: OSMO_POSTGRES_HOST - value: postgresql.osmo.svc.cluster.local - - name: OSMO_POSTGRES_PORT - value: "5432" - - name: OSMO_POSTGRES_USER - value: osmo_admin - - name: OSMO_POSTGRES_DATABASE - value: osmo - - name: OSMO_POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: postgres-secret - key: password - extraVolumes: - - name: vault-secrets - secret: - secretName: vault-secrets - extraVolumeMounts: - - name: vault-secrets - mountPath: /home/osmo/vault-agent/secrets - readOnly: true - -sidecars: - envoy: - enabled: false - rateLimit: - enabled: false - logAgent: - enabled: false - otel: - enabled: false diff --git a/applications/osmo/deploy/002a-setup/sample_osmo_realm.json b/applications/osmo/deploy/002a-setup/sample_osmo_realm.json deleted file mode 100755 index 54a65ed77..000000000 --- a/applications/osmo/deploy/002a-setup/sample_osmo_realm.json +++ /dev/null @@ -1,2636 +0,0 @@ -{ - "id": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "realm": "osmo", - "notBefore": 0, - "defaultSignatureAlgorithm": "RS256", - "revokeRefreshToken": false, - "refreshTokenMaxReuse": 0, - "accessTokenLifespan": 300, - "accessTokenLifespanForImplicitFlow": 900, - "ssoSessionIdleTimeout": 604800, - "ssoSessionMaxLifespan": 604800, - "ssoSessionIdleTimeoutRememberMe": 0, - "ssoSessionMaxLifespanRememberMe": 0, - "offlineSessionIdleTimeout": 2592000, - "offlineSessionMaxLifespanEnabled": false, - "offlineSessionMaxLifespan": 5184000, - "clientSessionIdleTimeout": 0, - "clientSessionMaxLifespan": 0, - "clientOfflineSessionIdleTimeout": 0, - "clientOfflineSessionMaxLifespan": 0, - "accessCodeLifespan": 60, - "accessCodeLifespanUserAction": 300, - "accessCodeLifespanLogin": 1800, - "actionTokenGeneratedByAdminLifespan": 43200, - "actionTokenGeneratedByUserLifespan": 300, - 
"oauth2DeviceCodeLifespan": 600, - "oauth2DevicePollingInterval": 5, - "enabled": true, - "sslRequired": "external", - "registrationAllowed": false, - "registrationEmailAsUsername": false, - "rememberMe": false, - "verifyEmail": false, - "loginWithEmailAllowed": false, - "duplicateEmailsAllowed": false, - "resetPasswordAllowed": false, - "editUsernameAllowed": false, - "bruteForceProtected": true, - "permanentLockout": false, - "maxTemporaryLockouts": 0, - "bruteForceStrategy": "MULTIPLE", - "maxFailureWaitSeconds": 300, - "minimumQuickLoginWaitSeconds": 60, - "waitIncrementSeconds": 60, - "quickLoginCheckMilliSeconds": 1000, - "maxDeltaTimeSeconds": 43200, - "failureFactor": 30, - "roles": { - "realm": [ - { - "id": "2fbf71d8-d3c1-4de3-8c08-ae55b254e092", - "name": "uma_authorization", - "description": "${role_uma_authorization}", - "composite": false, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "attributes": {} - }, - { - "id": "e22b93a7-88eb-4f66-a5cc-7c68a35d72fb", - "name": "offline_access", - "description": "${role_offline-access}", - "composite": false, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "attributes": {} - }, - { - "id": "c3d524ce-b3c8-42fd-9e6b-777a32960bb2", - "name": "admin", - "description": "${role_admin}", - "composite": true, - "composites": { - "realm": [ - "create-realm" - ], - "client": { - "realm-management": [ - "manage-realm", - "query-clients", - "view-users", - "manage-identity-providers", - "impersonation", - "view-events", - "manage-authorization", - "query-realms", - "manage-clients", - "view-clients", - "create-client", - "query-groups", - "view-identity-providers", - "view-realm", - "view-authorization", - "manage-users", - "query-users", - "manage-events" - ] - } - }, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "attributes": {} - }, - { - "id": "996ba034-02ae-40d4-8d14-735506151057", - "name": "default-roles-osmo", - 
"description": "${role_default-roles}", - "composite": true, - "composites": { - "realm": [ - "offline_access", - "uma_authorization" - ], - "client": { - "account": [ - "manage-account", - "view-profile" - ] - } - }, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "attributes": {} - }, - { - "id": "f5584dff-7c44-4204-b387-e3caf8ea3f46", - "name": "create-realm", - "description": "${role_create-realm}", - "composite": false, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c", - "attributes": {} - } - ], - "client": { - "osmo-realm": [], - "realm-management": [ - { - "id": "b8b96d4c-fc77-4e20-bc64-4918144dfdcf", - "name": "manage-realm", - "description": "${role_manage-realm}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "1dbd9f8f-e5e6-41b3-ba7c-746835fd9b79", - "name": "query-clients", - "description": "${role_query-clients}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "d27fc846-afad-42f9-8b11-636f4c535a36", - "name": "view-users", - "description": "${role_view-users}", - "composite": true, - "composites": { - "client": { - "realm-management": [ - "query-groups", - "query-users" - ] - } - }, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "3c345b77-4bdb-4360-bf81-fe85a77cbff7", - "name": "manage-identity-providers", - "description": "${role_manage-identity-providers}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "4953639a-2db7-45d7-a734-c42b487647c5", - "name": "impersonation", - "description": "${role_impersonation}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": 
"ae14995a-6e23-4b1d-a10d-dd0feebf1d4a", - "name": "view-events", - "description": "${role_view-events}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "5ae16954-f8ad-4237-be92-1eb6916ce6cb", - "name": "manage-authorization", - "description": "${role_manage-authorization}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "7663ba0a-60f3-46bb-9232-3a2cc1832e62", - "name": "query-realms", - "description": "${role_query-realms}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "121f50ad-06c7-4541-a40f-400710228515", - "name": "manage-clients", - "description": "${role_manage-clients}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "d8c6a12c-240c-415c-9299-30f5292d2b90", - "name": "view-clients", - "description": "${role_view-clients}", - "composite": true, - "composites": { - "client": { - "realm-management": [ - "query-clients" - ] - } - }, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "70ebf14f-cf79-4ad7-b4c4-3d5289288ec0", - "name": "create-client", - "description": "${role_create-client}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "1abf94ab-c2a7-469c-b081-584fbbb66046", - "name": "query-groups", - "description": "${role_query-groups}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "f8e1d204-7b77-446a-84fb-675c8c85e1f1", - "name": "realm-admin", - "description": "${role_realm-admin}", - "composite": true, - "composites": { - "client": { - "realm-management": [ - 
"manage-realm", - "query-clients", - "view-users", - "manage-identity-providers", - "impersonation", - "view-events", - "manage-authorization", - "query-realms", - "manage-clients", - "view-clients", - "create-client", - "query-groups", - "view-identity-providers", - "view-realm", - "view-authorization", - "manage-users", - "query-users", - "manage-events" - ] - } - }, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "72066e7f-f80f-4008-a0b3-531d3aebd2f0", - "name": "view-identity-providers", - "description": "${role_view-identity-providers}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "29649597-fdc9-4330-a96d-94218a1e91b2", - "name": "view-realm", - "description": "${role_view-realm}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "12c80e9d-c3d9-4e61-91ab-c986e3aafe48", - "name": "view-authorization", - "description": "${role_view-authorization}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "bde16849-39b1-4c85-985d-40e9a178e873", - "name": "manage-users", - "description": "${role_manage-users}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "62463d22-8113-41e0-af6a-fa81883c475d", - "name": "query-users", - "description": "${role_query-users}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - }, - { - "id": "e1afbd19-239f-4e78-abd9-5019b6baa7e2", - "name": "manage-events", - "description": "${role_manage-events}", - "composite": false, - "clientRole": true, - "containerId": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "attributes": {} - } - ], - "osmo-browser-flow": [ - 
{ - "id": "2cfce9e9-000e-4de8-a0b6-50f0a4252db3", - "name": "dashboard-admin", - "description": "Able to make change to the kubernetes dashboard", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - }, - { - "id": "454726d1-4f76-47f6-bcfa-5d64f759134f", - "name": "grafana-user", - "description": "Able to view dashboards in grafana", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - }, - { - "id": "9d91ae54-e69b-46e8-baee-7a16f044ded1", - "name": "osmo-user", - "description": "A regular user of osmo who can submit and query workflows and datasets", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - }, - { - "id": "9ec3a04d-49a4-414b-9e2f-35b70bbea18b", - "name": "dashboard-user", - "description": "Able to view the kubernetes dashboard", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - }, - { - "id": "dfd62581-88c7-4ebb-beac-7555d1aef105", - "name": "grafana-admin", - "description": "", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - }, - { - "id": "aa86ac92-9df4-499c-9f78-e3ed600ddc15", - "name": "osmo-admin", - "description": "Admin access to the osmo service", - "composite": false, - "clientRole": true, - "containerId": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "attributes": {} - } - ], - "security-admin-console": [], - "admin-cli": [], - "account-console": [], - "broker": [ - { - "id": "44300967-5867-4c57-a59a-5b8302cb8323", - "name": "read-token", - "description": "${role_read-token}", - "composite": false, - "clientRole": true, - "containerId": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", - "attributes": {} - } - ], - "osmo-device": [ - { - "id": "e126038f-20eb-4d31-a95b-e5267eb8c7f1", - "name": "osmo-user", - 
"description": "", - "composite": false, - "clientRole": true, - "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", - "attributes": {} - }, - { - "id": "20874405-f96b-456b-a3b8-86cfe8740144", - "name": "osmo-admin", - "description": "Admin access to the osmo service", - "composite": false, - "clientRole": true, - "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", - "attributes": {} - }, - { - "id": "94a41f7f-9927-489f-aa76-a9e3dafb4ed5", - "name": "osmo-backend", - "description": "", - "composite": false, - "clientRole": true, - "containerId": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", - "attributes": {} - } - ], - "account": [ - { - "id": "358c4e88-41b8-458b-83d9-e4c86a357095", - "name": "manage-account-links", - "description": "${role_manage-account-links}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "499f54a7-ccc5-4fef-bece-9ccdc6a80308", - "name": "manage-consent", - "description": "${role_manage-consent}", - "composite": true, - "composites": { - "client": { - "account": [ - "view-consent" - ] - } - }, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "f14ea475-e733-4f69-8475-693da2992a72", - "name": "view-applications", - "description": "${role_view-applications}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "aea168f8-7115-468b-9118-aae87937dee9", - "name": "view-consent", - "description": "${role_view-consent}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "47acd969-e55d-4382-946b-67fb2e4bb119", - "name": "manage-account", - "description": "${role_manage-account}", - "composite": true, - "composites": { - "client": { - "account": [ - "manage-account-links" - ] - } - }, - "clientRole": true, - "containerId": 
"049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "102cd4a5-8e95-4d3c-87de-a98c2958f5c0", - "name": "view-groups", - "description": "${role_view-groups}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "b6da542f-977e-437e-8d24-6cb4ed4612af", - "name": "delete-account", - "description": "${role_delete-account}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - }, - { - "id": "2da758ad-a74d-43ef-b911-6b52c8b60d90", - "name": "view-profile", - "description": "${role_view-profile}", - "composite": false, - "clientRole": true, - "containerId": "049b45a3-ba14-4735-8168-c9be73625a6f", - "attributes": {} - } - ] - } - }, - "groups": [ - { - "id": "979a1cd5-b392-4905-a868-17603faf9ca9", - "name": "Admin", - "path": "/Admin", - "subGroups": [], - "attributes": {}, - "realmRoles": [], - "clientRoles": { - "osmo-browser-flow": [ - "osmo-user", - "osmo-admin" - ], - "osmo-device": [ - "osmo-user", - "osmo-admin" - ] - } - }, - { - "id": "2fc39861-b636-47c8-b57b-d1719466759c", - "name": "Backend Operator", - "path": "/Backend Operator", - "subGroups": [], - "attributes": {}, - "realmRoles": [], - "clientRoles": { - "osmo-device": [ - "osmo-backend" - ] - } - }, - { - "id": "57a9b7f0-36ec-46c5-9781-49d53b1c6468", - "name": "User", - "path": "/User", - "subGroups": [], - "attributes": {}, - "realmRoles": [], - "clientRoles": { - "osmo-browser-flow": [ - "osmo-user", - "grafana-user", - "dashboard-user" - ], - "osmo-device": [ - "osmo-user" - ] - } - } - ], - "defaultRole": { - "id": "996ba034-02ae-40d4-8d14-735506151057", - "name": "default-roles-osmo", - "description": "${role_default-roles}", - "composite": true, - "clientRole": false, - "containerId": "dfee7915-9b8c-4f7a-b7dd-465663999f1c" - }, - "requiredCredentials": [ - "password" - ], - "otpPolicyType": "totp", - "otpPolicyAlgorithm": 
"HmacSHA1", - "otpPolicyInitialCounter": 0, - "otpPolicyDigits": 6, - "otpPolicyLookAheadWindow": 1, - "otpPolicyPeriod": 30, - "otpPolicyCodeReusable": false, - "otpSupportedApplications": [ - "totpAppFreeOTPName", - "totpAppGoogleName", - "totpAppMicrosoftAuthenticatorName" - ], - "localizationTexts": {}, - "webAuthnPolicyRpEntityName": "keycloak", - "webAuthnPolicySignatureAlgorithms": [ - "ES256" - ], - "webAuthnPolicyRpId": "", - "webAuthnPolicyAttestationConveyancePreference": "not specified", - "webAuthnPolicyAuthenticatorAttachment": "not specified", - "webAuthnPolicyRequireResidentKey": "not specified", - "webAuthnPolicyUserVerificationRequirement": "not specified", - "webAuthnPolicyCreateTimeout": 0, - "webAuthnPolicyAvoidSameAuthenticatorRegister": false, - "webAuthnPolicyAcceptableAaguids": [], - "webAuthnPolicyExtraOrigins": [], - "webAuthnPolicyPasswordlessRpEntityName": "keycloak", - "webAuthnPolicyPasswordlessSignatureAlgorithms": [ - "ES256" - ], - "webAuthnPolicyPasswordlessRpId": "", - "webAuthnPolicyPasswordlessAttestationConveyancePreference": "not specified", - "webAuthnPolicyPasswordlessAuthenticatorAttachment": "not specified", - "webAuthnPolicyPasswordlessRequireResidentKey": "not specified", - "webAuthnPolicyPasswordlessUserVerificationRequirement": "not specified", - "webAuthnPolicyPasswordlessCreateTimeout": 0, - "webAuthnPolicyPasswordlessAvoidSameAuthenticatorRegister": false, - "webAuthnPolicyPasswordlessAcceptableAaguids": [], - "webAuthnPolicyPasswordlessExtraOrigins": [], - "scopeMappings": [ - { - "clientScope": "offline_access", - "roles": [ - "offline_access" - ] - } - ], - "clientScopeMappings": { - "account": [ - { - "client": "account-console", - "roles": [ - "manage-account", - "view-groups" - ] - } - ] - }, - "clients": [ - { - "id": "049b45a3-ba14-4735-8168-c9be73625a6f", - "clientId": "account", - "name": "${client_account}", - "rootUrl": "${authBaseUrl}", - "baseUrl": "/realms/osmo/account/", - "surrogateAuthRequired": 
false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [ - "/realms/osmo/account/*" - ], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": true, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "post.logout.redirect.uris": "+" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": false, - "nodeReRegistrationTimeout": 0, - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "a18dadb1-a13d-4523-8e33-446ff5781676", - "clientId": "account-console", - "name": "${client_account-console}", - "rootUrl": "${authBaseUrl}", - "baseUrl": "/realms/osmo/account/", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [ - "/realms/osmo/account/*" - ], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": true, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "post.logout.redirect.uris": "+", - "pkce.code.challenge.method": "S256" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": false, - "nodeReRegistrationTimeout": 0, - "protocolMappers": [ - { - "id": "d3db99fd-64a1-48b8-82bd-a92533e2fd4c", - "name": "audience resolve", - "protocol": "openid-connect", - "protocolMapper": "oidc-audience-resolve-mapper", - 
"consentRequired": false, - "config": {} - } - ], - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "14047566-1501-4403-92c7-418ef38e3ba4", - "clientId": "admin-cli", - "name": "${client_admin-cli}", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": false, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": true, - "serviceAccountsEnabled": false, - "publicClient": true, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "client.use.lightweight.access.token.enabled": "true", - "post.logout.redirect.uris": "+" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": true, - "nodeReRegistrationTimeout": 0, - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "6fdf7b8e-1146-4dd9-a3dc-dd93e877cf2a", - "clientId": "broker", - "name": "${client_broker}", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": true, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": false, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "true", - "post.logout.redirect.uris": "+" - }, - "authenticationFlowBindingOverrides": {}, - 
"fullScopeAllowed": false, - "nodeReRegistrationTimeout": 0, - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "76cd6c90-2b1a-40d8-a1d4-d2469859cac5", - "clientId": "osmo-browser-flow", - "name": "Osmo Browser Flow", - "description": "Allow logging into osmo using the authorization code based browser flow", - "rootUrl": "https://default.com", - "adminUrl": "https://default.com", - "baseUrl": "https://default.com/docs", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "secret": "**********", - "redirectUris": [ - "", - "https://default.com/setup/getAToken", - "https://default.com/getAToken", - "https://default.com/api/auth/getAToken" - ], - "webOrigins": [ - "*", - "https://default.com" - ], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": false, - "frontchannelLogout": true, - "protocol": "openid-connect", - "attributes": { - "client.secret.creation.time": "1762965594", - "post.logout.redirect.uris": "+", - "frontchannel.logout.session.required": "true", - "oauth2.device.authorization.grant.enabled": "false", - "backchannel.logout.revoke.offline.tokens": "false", - "use.refresh.tokens": "true", - "realm_client": "false", - "oidc.ciba.grant.enabled": "false", - "backchannel.logout.session.required": "true", - "client_credentials.use_refresh_token": "false", - "acr.loa.map": "{}", - "require.pushed.authorization.requests": "false", - "tls.client.certificate.bound.access.tokens": "false", - "display.on.consent.screen": "false", - "token.response.type.bearer.lower-case": "false" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": true, - 
"nodeReRegistrationTimeout": -1, - "protocolMappers": [ - { - "id": "8fcbb19c-503b-4173-a35b-69cc23bc112f", - "name": "Create \"roles\" claim", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-client-role-mapper", - "consentRequired": false, - "config": { - "multivalued": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "roles", - "jsonType.label": "String", - "usermodel.clientRoleMapping.clientId": "osmo-browser-flow" - } - } - ], - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "82b4ce74-5ff3-440f-b2f6-2c7786230e35", - "clientId": "osmo-device", - "name": "Osmo device flow", - "description": "Allow login with devices such as cli", - "rootUrl": "https://default.com", - "adminUrl": "https://default.com", - "baseUrl": "https://default.com", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [ - "https://default.com/*" - ], - "webOrigins": [ - "https://default.com" - ], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": false, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": true, - "serviceAccountsEnabled": false, - "publicClient": true, - "frontchannelLogout": true, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "oidc.ciba.grant.enabled": "false", - "backchannel.logout.session.required": "true", - "post.logout.redirect.uris": "+", - "frontchannel.logout.session.required": "true", - "display.on.consent.screen": "false", - "oauth2.device.authorization.grant.enabled": "true", - "backchannel.logout.revoke.offline.tokens": "false" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": true, - 
"nodeReRegistrationTimeout": -1, - "protocolMappers": [ - { - "id": "21f8be09-ffc5-4a26-855b-6be4ab297c67", - "name": "Create \"roles\" claim", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-client-role-mapper", - "consentRequired": false, - "config": { - "multivalued": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "roles", - "jsonType.label": "String", - "usermodel.clientRoleMapping.clientId": "osmo-device" - } - } - ], - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "06a0fe4b-c247-4233-af67-78138bf5337a", - "clientId": "osmo-realm", - "name": "OSMO Realm", - "description": "", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": true, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": false, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "oidc.ciba.grant.enabled": "false", - "backchannel.logout.session.required": "true", - "post.logout.redirect.uris": "+", - "oauth2.device.authorization.grant.enabled": "false", - "backchannel.logout.revoke.offline.tokens": "false" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": false, - "nodeReRegistrationTimeout": 0, - "defaultClientScopes": [], - "optionalClientScopes": [] - }, - { - "id": "6cf7e90d-f7a3-47ea-9631-fc7fb19b6fbc", - "clientId": "realm-management", - "name": "${client_realm-management}", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - 
"clientAuthenticatorType": "client-secret", - "redirectUris": [], - "webOrigins": [], - "notBefore": 0, - "bearerOnly": true, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": false, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "true", - "post.logout.redirect.uris": "+" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": false, - "nodeReRegistrationTimeout": 0, - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - }, - { - "id": "c70e9b76-96a2-41da-84da-df8b9e0d228d", - "clientId": "security-admin-console", - "name": "${client_security-admin-console}", - "rootUrl": "${authAdminUrl}", - "baseUrl": "/admin/osmo/console/", - "surrogateAuthRequired": false, - "enabled": true, - "alwaysDisplayInConsole": false, - "clientAuthenticatorType": "client-secret", - "redirectUris": [ - "/admin/osmo/console/*" - ], - "webOrigins": [ - "+" - ], - "notBefore": 0, - "bearerOnly": false, - "consentRequired": false, - "standardFlowEnabled": true, - "implicitFlowEnabled": false, - "directAccessGrantsEnabled": false, - "serviceAccountsEnabled": false, - "publicClient": true, - "frontchannelLogout": false, - "protocol": "openid-connect", - "attributes": { - "realm_client": "false", - "client.use.lightweight.access.token.enabled": "true", - "post.logout.redirect.uris": "+", - "pkce.code.challenge.method": "S256" - }, - "authenticationFlowBindingOverrides": {}, - "fullScopeAllowed": true, - "nodeReRegistrationTimeout": 0, - "protocolMappers": [ - { - "id": "e921764f-2d7f-4a08-833c-204801a096db", - "name": "locale", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - 
"user.attribute": "locale", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "locale", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - } - ], - "defaultClientScopes": [ - "web-origins", - "acr", - "profile", - "roles", - "basic", - "email" - ], - "optionalClientScopes": [ - "address", - "phone", - "offline_access", - "microprofile-jwt" - ] - } - ], - "clientScopes": [ - { - "id": "e172a6de-ad7d-4cbd-be06-010d284b6806", - "name": "basic", - "description": "OpenID Connect scope for add all basic claims to the token", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "false", - "display.on.consent.screen": "false" - }, - "protocolMappers": [ - { - "id": "e67f2d9e-7cf0-4875-a72d-ce4a086adf7b", - "name": "auth_time", - "protocol": "openid-connect", - "protocolMapper": "oidc-usersessionmodel-note-mapper", - "consentRequired": false, - "config": { - "user.session.note": "AUTH_TIME", - "introspection.token.claim": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "auth_time", - "jsonType.label": "long" - } - }, - { - "id": "eba73e8f-7d13-46c7-9e6e-44e8839b1022", - "name": "sub", - "protocol": "openid-connect", - "protocolMapper": "oidc-sub-mapper", - "consentRequired": false, - "config": { - "access.token.claim": "true", - "introspection.token.claim": "true" - } - } - ] - }, - { - "id": "76307a43-d2c9-40df-a686-6c4c10e0f70d", - "name": "address", - "description": "OpenID Connect built-in scope: address", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "true", - "consent.screen.text": "${addressScopeConsentText}", - "display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "32ac1e8f-3680-4c50-8bb4-7eed44c679b1", - "name": "address", - "protocol": "openid-connect", - "protocolMapper": "oidc-address-mapper", - "consentRequired": false, - "config": { - "user.attribute.formatted": 
"formatted", - "user.attribute.country": "country", - "user.attribute.postal_code": "postal_code", - "userinfo.token.claim": "true", - "user.attribute.street": "street", - "id.token.claim": "true", - "user.attribute.region": "region", - "access.token.claim": "true", - "user.attribute.locality": "locality" - } - } - ] - }, - { - "id": "67a444ee-3246-4878-a525-e0015e9b31cb", - "name": "offline_access", - "description": "OpenID Connect built-in scope: offline_access", - "protocol": "openid-connect", - "attributes": { - "consent.screen.text": "${offlineAccessScopeConsentText}", - "display.on.consent.screen": "true" - } - }, - { - "id": "1e8f098a-66fe-4df2-9547-47be0d040c53", - "name": "email", - "description": "OpenID Connect built-in scope: email", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "true", - "consent.screen.text": "${emailScopeConsentText}", - "display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "00e95ac6-b825-4180-9558-4dffeac9584a", - "name": "email", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "email", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "email", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "9f5125d5-3b89-4f0f-a13e-b8fbb4d6afc1", - "name": "email verified", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-property-mapper", - "consentRequired": false, - "config": { - "user.attribute": "emailVerified", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "email_verified", - "jsonType.label": "boolean", - "userinfo.token.claim": "true" - } - } - ] - }, - { - "id": "988f9517-5cd2-4b66-90ba-3399d667d0f8", - "name": "role_list", - "description": "SAML role list", - "protocol": "saml", - "attributes": { - "consent.screen.text": "${samlRoleListScopeConsentText}", - 
"display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "b78abf35-1108-40e2-a3c8-c6ea4200e817", - "name": "role list", - "protocol": "saml", - "protocolMapper": "saml-role-list-mapper", - "consentRequired": false, - "config": { - "single": "false", - "attribute.nameformat": "Basic", - "attribute.name": "Role" - } - } - ] - }, - { - "id": "f1dcc0f6-63be-4f85-a8cd-d43072e0eba4", - "name": "microprofile-jwt", - "description": "Microprofile - JWT built-in scope", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "true", - "display.on.consent.screen": "false" - }, - "protocolMappers": [ - { - "id": "bf488bdc-2622-45f0-95c2-df2d05fd3fab", - "name": "upn", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "username", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "upn", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "5aa8e8c1-f0d7-46c4-b2da-24aa9608da9f", - "name": "groups", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-realm-role-mapper", - "consentRequired": false, - "config": { - "multivalued": "true", - "userinfo.token.claim": "true", - "user.attribute": "foo", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "groups", - "jsonType.label": "String" - } - } - ] - }, - { - "id": "fe58e218-3aac-4780-8b5e-b61491cd457b", - "name": "profile", - "description": "OpenID Connect built-in scope: profile", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "true", - "consent.screen.text": "${profileScopeConsentText}", - "display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "e0616aae-d3e0-4911-98b2-db72ad142938", - "name": "nickname", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - 
"user.attribute": "nickname", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "nickname", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "49cc1e1d-9401-4b57-b8a9-a37573f2eb06", - "name": "profile", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "profile", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "profile", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "e05eea05-f917-4ef3-a82f-501c82192bd6", - "name": "gender", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "gender", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "gender", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "89c031e1-bfad-4afd-af24-51db2c62a11f", - "name": "username", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "username", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "preferred_username", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "30d27d3e-3b72-49d1-a66f-0466b58dbf3b", - "name": "locale", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "locale", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "locale", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "9fc26d9e-c109-4b30-8ec2-2fc2d95b11d6", - "name": "picture", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "picture", - 
"id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "picture", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "5c0dbd32-7a45-4dc9-9e4f-37570ebf5d38", - "name": "family name", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "lastName", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "family_name", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "2de0c290-124a-41be-b7d8-f61f63eed5ef", - "name": "full name", - "protocol": "openid-connect", - "protocolMapper": "oidc-full-name-mapper", - "consentRequired": false, - "config": { - "id.token.claim": "true", - "access.token.claim": "true", - "userinfo.token.claim": "true" - } - }, - { - "id": "369e67dd-fd5e-4d90-8d80-c945c7a0c049", - "name": "updated at", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "updatedAt", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "updated_at", - "jsonType.label": "long", - "userinfo.token.claim": "true" - } - }, - { - "id": "7557b943-11a1-42bb-a119-35e8da9fcb99", - "name": "birthdate", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "birthdate", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "birthdate", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "06359527-ce26-45f7-beba-7ccf5e71d6f5", - "name": "given name", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "firstName", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "given_name", - "jsonType.label": 
"String", - "userinfo.token.claim": "true" - } - }, - { - "id": "8f3bfe54-a74a-4eed-b2bd-4157fc574b57", - "name": "middle name", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "middleName", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "middle_name", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "a6cbf817-a0f5-483d-ae1e-c716d04e1645", - "name": "website", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "website", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "website", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "1322fc37-04e4-4e89-99d4-6c304ad36c96", - "name": "zoneinfo", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "zoneinfo", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "zoneinfo", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - } - ] - }, - { - "id": "6aec68b8-7178-449d-9ba6-b6e1c2a9be73", - "name": "service_account", - "description": "Specific scope for a client enabled for service accounts", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "false", - "display.on.consent.screen": "false" - }, - "protocolMappers": [ - { - "id": "91715642-086a-493b-8f01-5c64d408b7e3", - "name": "Client ID", - "protocol": "openid-connect", - "protocolMapper": "oidc-usersessionmodel-note-mapper", - "consentRequired": false, - "config": { - "user.session.note": "client_id", - "introspection.token.claim": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "client_id", - "jsonType.label": "String" - } 
- }, - { - "id": "78dcf109-44bb-4aca-9540-a8896f26e864", - "name": "Client Host", - "protocol": "openid-connect", - "protocolMapper": "oidc-usersessionmodel-note-mapper", - "consentRequired": false, - "config": { - "user.session.note": "clientHost", - "introspection.token.claim": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "clientHost", - "jsonType.label": "String" - } - }, - { - "id": "e28a076d-9ee0-46ec-a2f0-a147bab66a09", - "name": "Client IP Address", - "protocol": "openid-connect", - "protocolMapper": "oidc-usersessionmodel-note-mapper", - "consentRequired": false, - "config": { - "user.session.note": "clientAddress", - "introspection.token.claim": "true", - "userinfo.token.claim": "true", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "clientAddress", - "jsonType.label": "String" - } - } - ] - }, - { - "id": "e728df12-1bff-418d-a68d-c2036d856db2", - "name": "roles", - "description": "OpenID Connect scope for add user roles to the access token", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "false", - "consent.screen.text": "${rolesScopeConsentText}", - "display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "993f7f9d-55ba-4c1f-b84a-76e2c733bc94", - "name": "client roles", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-client-role-mapper", - "consentRequired": false, - "config": { - "user.attribute": "foo", - "access.token.claim": "true", - "claim.name": "resource_access.${client_id}.roles", - "jsonType.label": "String", - "multivalued": "true" - } - }, - { - "id": "f0b2b858-1cde-412b-a1c8-8ed3bd4e04d6", - "name": "realm roles", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-realm-role-mapper", - "consentRequired": false, - "config": { - "user.attribute": "foo", - "access.token.claim": "true", - "claim.name": "realm_access.roles", - "jsonType.label": "String", - 
"multivalued": "true" - } - }, - { - "id": "32ad3286-1486-4196-9232-533af4c10009", - "name": "audience resolve", - "protocol": "openid-connect", - "protocolMapper": "oidc-audience-resolve-mapper", - "consentRequired": false, - "config": {} - } - ] - }, - { - "id": "efee9fbd-1a06-41d4-94d1-16b59f8d9a68", - "name": "web-origins", - "description": "OpenID Connect scope for add allowed web origins to the access token", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "false", - "consent.screen.text": "", - "display.on.consent.screen": "false" - }, - "protocolMappers": [ - { - "id": "61110fbc-75c7-40cd-aca2-9b7a714b0b22", - "name": "allowed web origins", - "protocol": "openid-connect", - "protocolMapper": "oidc-allowed-origins-mapper", - "consentRequired": false, - "config": {} - } - ] - }, - { - "id": "4a0abefc-0423-403d-8383-10f989580c13", - "name": "phone", - "description": "OpenID Connect built-in scope: phone", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "true", - "consent.screen.text": "${phoneScopeConsentText}", - "display.on.consent.screen": "true" - }, - "protocolMappers": [ - { - "id": "acdce654-be20-4386-bd4f-edf2cd868f6b", - "name": "phone number", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "phoneNumber", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "phone_number", - "jsonType.label": "String", - "userinfo.token.claim": "true" - } - }, - { - "id": "37082e43-4429-479d-bd80-7b8d11b17769", - "name": "phone number verified", - "protocol": "openid-connect", - "protocolMapper": "oidc-usermodel-attribute-mapper", - "consentRequired": false, - "config": { - "user.attribute": "phoneNumberVerified", - "id.token.claim": "true", - "access.token.claim": "true", - "claim.name": "phone_number_verified", - "jsonType.label": "boolean", - "userinfo.token.claim": "true" - } - } - 
] - }, - { - "id": "1e5f680b-df5f-4d8c-b9c9-52b5445171ce", - "name": "acr", - "description": "OpenID Connect scope for add acr (authentication context class reference) to the token", - "protocol": "openid-connect", - "attributes": { - "include.in.token.scope": "false", - "display.on.consent.screen": "false" - }, - "protocolMappers": [ - { - "id": "590accb2-1b94-452e-bb20-51bc643fe860", - "name": "acr loa level", - "protocol": "openid-connect", - "protocolMapper": "oidc-acr-mapper", - "consentRequired": false, - "config": { - "id.token.claim": "true", - "access.token.claim": "true", - "userinfo.token.claim": "true" - } - } - ] - } - ], - "defaultDefaultClientScopes": [ - "role_list", - "profile", - "email", - "roles", - "web-origins", - "acr", - "basic" - ], - "defaultOptionalClientScopes": [ - "offline_access", - "address", - "phone", - "microprofile-jwt" - ], - "browserSecurityHeaders": { - "contentSecurityPolicyReportOnly": "", - "xContentTypeOptions": "nosniff", - "referrerPolicy": "no-referrer", - "xRobotsTag": "none", - "xFrameOptions": "SAMEORIGIN", - "contentSecurityPolicy": "frame-src 'self'; frame-ancestors 'self'; object-src 'none';", - "xXSSProtection": "1; mode=block", - "strictTransportSecurity": "max-age=31536000; includeSubDomains" - }, - "smtpServer": {}, - "eventsEnabled": false, - "eventsListeners": [ - "jboss-logging" - ], - "enabledEventTypes": [], - "adminEventsEnabled": false, - "adminEventsDetailsEnabled": false, - "identityProviders": [], - "identityProviderMappers": [], - "components": { - "org.keycloak.services.clientregistration.policy.ClientRegistrationPolicy": [ - { - "id": "76bd801e-c608-4338-8198-668c92446a35", - "name": "Full Scope Disabled", - "providerId": "scope", - "subType": "anonymous", - "subComponents": {}, - "config": {} - }, - { - "id": "06472a8f-7614-4022-b08e-62f023a5fe0a", - "name": "Allowed Client Scopes", - "providerId": "allowed-client-templates", - "subType": "anonymous", - "subComponents": {}, - "config": { - 
"allow-default-scopes": [ - "true" - ] - } - }, - { - "id": "3667ac91-1abf-4124-91e6-ffc803dc29aa", - "name": "Consent Required", - "providerId": "consent-required", - "subType": "anonymous", - "subComponents": {}, - "config": {} - }, - { - "id": "6e0c8a3f-b5f4-4a49-b44c-bde8ae314d89", - "name": "Max Clients Limit", - "providerId": "max-clients", - "subType": "anonymous", - "subComponents": {}, - "config": { - "max-clients": [ - "200" - ] - } - }, - { - "id": "62d78a88-78a2-4ea7-937b-9a062e946108", - "name": "Trusted Hosts", - "providerId": "trusted-hosts", - "subType": "anonymous", - "subComponents": {}, - "config": { - "host-sending-registration-request-must-match": [ - "true" - ], - "client-uris-must-match": [ - "true" - ] - } - }, - { - "id": "0ca9718d-bfca-4059-b7e8-e32ae3f70a7f", - "name": "Allowed Protocol Mapper Types", - "providerId": "allowed-protocol-mappers", - "subType": "authenticated", - "subComponents": {}, - "config": { - "allowed-protocol-mapper-types": [ - "oidc-address-mapper", - "saml-user-property-mapper", - "oidc-usermodel-attribute-mapper", - "oidc-usermodel-property-mapper", - "oidc-full-name-mapper", - "saml-role-list-mapper", - "saml-user-attribute-mapper", - "oidc-sha256-pairwise-sub-mapper" - ] - } - }, - { - "id": "9247c25c-ce3e-4858-8dda-b2c95b2f4d09", - "name": "Allowed Client Scopes", - "providerId": "allowed-client-templates", - "subType": "authenticated", - "subComponents": {}, - "config": { - "allow-default-scopes": [ - "true" - ] - } - }, - { - "id": "2d3e37a6-c167-4992-abf8-8cbe22f1bcb9", - "name": "Allowed Protocol Mapper Types", - "providerId": "allowed-protocol-mappers", - "subType": "anonymous", - "subComponents": {}, - "config": { - "allowed-protocol-mapper-types": [ - "saml-user-property-mapper", - "oidc-full-name-mapper", - "oidc-address-mapper", - "saml-role-list-mapper", - "oidc-usermodel-attribute-mapper", - "oidc-usermodel-property-mapper", - "oidc-sha256-pairwise-sub-mapper", - "saml-user-attribute-mapper" - ] - } - 
} - ], - "org.keycloak.userprofile.UserProfileProvider": [ - { - "id": "c12df2b1-cd7d-46b7-ba91-b4381a59f487", - "providerId": "declarative-user-profile", - "subComponents": {}, - "config": { - "kc.user.profile.config": [ - "{\"attributes\":[{\"name\":\"username\",\"displayName\":\"${username}\",\"validations\":{\"length\":{\"min\":3,\"max\":255},\"username-prohibited-characters\":{},\"up-username-not-idn-homograph\":{}},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"email\",\"displayName\":\"${email}\",\"validations\":{\"email\":{},\"length\":{\"max\":255}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"firstName\",\"displayName\":\"${firstName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false},{\"name\":\"lastName\",\"displayName\":\"${lastName}\",\"validations\":{\"length\":{\"max\":255},\"person-name-prohibited-characters\":{}},\"required\":{\"roles\":[\"user\"]},\"permissions\":{\"view\":[\"admin\",\"user\"],\"edit\":[\"admin\",\"user\"]},\"multivalued\":false}],\"groups\":[{\"name\":\"user-metadata\",\"displayHeader\":\"User metadata\",\"displayDescription\":\"Attributes, which refer to user metadata\"}],\"unmanagedAttributePolicy\":\"ENABLED\"}" - ] - } - } - ], - "org.keycloak.keys.KeyProvider": [ - { - "id": "29577a17-9e8a-40cf-b804-cf36c2cf567c", - "name": "hmac-generated-hs512", - "providerId": "hmac-generated", - "subComponents": {}, - "config": { - "priority": [ - "100" - ], - "algorithm": [ - "HS512" - ] - } - }, - { - "id": "48051b03-e0a1-413d-af4a-d9c301f12662", - "name": "rsa-enc-generated", - "providerId": "rsa-enc-generated", - "subComponents": {}, - "config": { - "priority": [ - "100" - ], - "algorithm": [ - 
"RSA-OAEP" - ] - } - }, - { - "id": "04c1d0e1-6889-48d2-833a-449a2a9e6fe1", - "name": "hmac-generated", - "providerId": "hmac-generated", - "subComponents": {}, - "config": { - "priority": [ - "100" - ], - "algorithm": [ - "HS256" - ] - } - }, - { - "id": "500737be-f83b-4e67-954e-9e71ca7ed1b0", - "name": "rsa-generated", - "providerId": "rsa-generated", - "subComponents": {}, - "config": { - "priority": [ - "100" - ] - } - }, - { - "id": "7842aa88-a8fb-49a2-ac10-e437337e236a", - "name": "aes-generated", - "providerId": "aes-generated", - "subComponents": {}, - "config": { - "priority": [ - "100" - ] - } - } - ] - }, - "internationalizationEnabled": false, - "supportedLocales": [], - "authenticationFlows": [ - { - "id": "43f7c655-a9cd-4d53-8161-3b3d2008c126", - "alias": "Account verification options", - "description": "Method with which to verity the existing account", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "idp-email-verification", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "ALTERNATIVE", - "priority": 20, - "autheticatorFlow": true, - "flowAlias": "Verify Existing Account by Re-authentication", - "userSetupAllowed": false - } - ] - }, - { - "id": "0f5c2215-5f40-4509-bb6f-f28c9b743388", - "alias": "Browser - Conditional OTP", - "description": "Flow to determine if the OTP is required for the authentication", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "conditional-user-configured", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "auth-otp-form", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": 
false, - "userSetupAllowed": false - } - ] - }, - { - "id": "eb66c86a-efdc-4039-9153-cd4708f39ba7", - "alias": "Direct Grant - Conditional OTP", - "description": "Flow to determine if the OTP is required for the authentication", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "conditional-user-configured", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "direct-grant-validate-otp", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, - { - "id": "e68e679a-5fc1-427b-93c6-5657f3ff6eb1", - "alias": "First broker login - Conditional OTP", - "description": "Flow to determine if the OTP is required for the authentication", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "conditional-user-configured", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "auth-otp-form", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, - { - "id": "e4a832f6-bae3-41c6-8198-5c14c6ddf706", - "alias": "Handle Existing Account", - "description": "Handle what to do if there is existing account with same email/username like authenticated identity provider", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "idp-confirm-link", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": 
true, - "flowAlias": "Account verification options", - "userSetupAllowed": false - } - ] - }, - { - "id": "2bbaf432-1058-4ee4-a994-d87f1c224032", - "alias": "Reset - Conditional OTP", - "description": "Flow to determine if the OTP should be reset or not. Set to REQUIRED to force.", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "conditional-user-configured", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "reset-otp", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, - { - "id": "352782b8-ddae-4ddc-af19-86a2900ef1f9", - "alias": "User creation or linking", - "description": "Flow for the existing/non-existing user alternatives", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticatorConfig": "create unique user config", - "authenticator": "idp-create-user-if-unique", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "ALTERNATIVE", - "priority": 20, - "autheticatorFlow": true, - "flowAlias": "Handle Existing Account", - "userSetupAllowed": false - } - ] - }, - { - "id": "fdc0ecfb-67f8-4390-85a0-50ecfdc66800", - "alias": "Verify Existing Account by Re-authentication", - "description": "Reauthentication of existing account", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "idp-username-password-form", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "CONDITIONAL", - 
"priority": 20, - "autheticatorFlow": true, - "flowAlias": "First broker login - Conditional OTP", - "userSetupAllowed": false - } - ] - }, - { - "id": "a656206c-59b9-47cf-8880-c0f04f04a0c3", - "alias": "browser", - "description": "browser based authentication", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "auth-cookie", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "auth-spnego", - "authenticatorFlow": false, - "requirement": "DISABLED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "identity-provider-redirector", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 25, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "ALTERNATIVE", - "priority": 30, - "autheticatorFlow": true, - "flowAlias": "forms", - "userSetupAllowed": false - } - ] - }, - { - "id": "7616793a-19e4-4d97-b7ae-ab962acaf444", - "alias": "clients", - "description": "Base authentication for clients", - "providerId": "client-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "client-secret", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "client-jwt", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "client-secret-jwt", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 30, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "client-x509", - "authenticatorFlow": false, - "requirement": "ALTERNATIVE", - "priority": 40, - 
"autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, - { - "id": "1f5446d7-d5de-47fb-8e15-347105d3d062", - "alias": "direct grant", - "description": "OpenID Connect Resource Owner Grant", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "direct-grant-validate-username", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "direct-grant-validate-password", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "CONDITIONAL", - "priority": 30, - "autheticatorFlow": true, - "flowAlias": "Direct Grant - Conditional OTP", - "userSetupAllowed": false - } - ] - }, - { - "id": "a55463dd-3ced-4102-a263-c121db059379", - "alias": "docker auth", - "description": "Used by Docker clients to authenticate against the IDP", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "docker-http-basic-authenticator", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, - { - "id": "646a12ee-99e7-41cd-a1ea-3ed5e5a96dcf", - "alias": "first broker login", - "description": "Actions taken after first broker login with identity provider account, which is not yet linked to any Keycloak account", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticatorConfig": "review profile config", - "authenticator": "idp-review-profile", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "REQUIRED", - "priority": 20, - 
"autheticatorFlow": true, - "flowAlias": "User creation or linking", - "userSetupAllowed": false - } - ] - }, - { - "id": "03f283e4-7b80-4b38-b90d-33ba8b0a07c3", - "alias": "forms", - "description": "Username, password, otp and other auth forms.", - "providerId": "basic-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "auth-username-password-form", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "CONDITIONAL", - "priority": 20, - "autheticatorFlow": true, - "flowAlias": "Browser - Conditional OTP", - "userSetupAllowed": false - } - ] - }, - { - "id": "047f04f4-b2c9-4aa9-bc38-4ed2c17d3e2c", - "alias": "registration", - "description": "registration flow", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "registration-page-form", - "authenticatorFlow": true, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": true, - "flowAlias": "registration form", - "userSetupAllowed": false - } - ] - }, - { - "id": "51cfacd6-9ee8-4fb2-a3fe-9e00246d9877", - "alias": "registration form", - "description": "registration form", - "providerId": "form-flow", - "topLevel": false, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "registration-user-creation", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "registration-password-action", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 50, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "registration-recaptcha-action", - "authenticatorFlow": false, - "requirement": "DISABLED", - "priority": 60, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - }, 
- { - "id": "28bb511d-c4ea-4bb8-805c-086eeaf7b239", - "alias": "reset credentials", - "description": "Reset credentials for a user if they forgot their password or something", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "reset-credentials-choose-user", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "reset-credential-email", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 20, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticator": "reset-password", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 30, - "autheticatorFlow": false, - "userSetupAllowed": false - }, - { - "authenticatorFlow": true, - "requirement": "CONDITIONAL", - "priority": 40, - "autheticatorFlow": true, - "flowAlias": "Reset - Conditional OTP", - "userSetupAllowed": false - } - ] - }, - { - "id": "d0189a78-5979-47ce-8536-32c8f6dec1b6", - "alias": "saml ecp", - "description": "SAML ECP Profile Authentication Flow", - "providerId": "basic-flow", - "topLevel": true, - "builtIn": true, - "authenticationExecutions": [ - { - "authenticator": "http-basic-authenticator", - "authenticatorFlow": false, - "requirement": "REQUIRED", - "priority": 10, - "autheticatorFlow": false, - "userSetupAllowed": false - } - ] - } - ], - "authenticatorConfig": [ - { - "id": "09fd7502-4e05-437f-865a-221fa1297e67", - "alias": "create unique user config", - "config": { - "require.password.update.after.registration": "false" - } - }, - { - "id": "9abca294-1e03-418f-841c-18b00053f949", - "alias": "review profile config", - "config": { - "update.profile.on.first.login": "missing" - } - } - ], - "requiredActions": [ - { - "alias": "CONFIGURE_TOTP", - "name": "Configure OTP", - "providerId": "CONFIGURE_TOTP", - "enabled": true, - "defaultAction": false, - 
"priority": 10, - "config": {} - }, - { - "alias": "TERMS_AND_CONDITIONS", - "name": "Terms and Conditions", - "providerId": "TERMS_AND_CONDITIONS", - "enabled": false, - "defaultAction": false, - "priority": 20, - "config": {} - }, - { - "alias": "UPDATE_PASSWORD", - "name": "Update Password", - "providerId": "UPDATE_PASSWORD", - "enabled": true, - "defaultAction": false, - "priority": 30, - "config": {} - }, - { - "alias": "UPDATE_PROFILE", - "name": "Update Profile", - "providerId": "UPDATE_PROFILE", - "enabled": true, - "defaultAction": false, - "priority": 40, - "config": {} - }, - { - "alias": "VERIFY_EMAIL", - "name": "Verify Email", - "providerId": "VERIFY_EMAIL", - "enabled": true, - "defaultAction": false, - "priority": 50, - "config": {} - }, - { - "alias": "delete_account", - "name": "Delete Account", - "providerId": "delete_account", - "enabled": false, - "defaultAction": false, - "priority": 60, - "config": {} - }, - { - "alias": "webauthn-register", - "name": "Webauthn Register", - "providerId": "webauthn-register", - "enabled": true, - "defaultAction": false, - "priority": 70, - "config": {} - }, - { - "alias": "webauthn-register-passwordless", - "name": "Webauthn Register Passwordless", - "providerId": "webauthn-register-passwordless", - "enabled": true, - "defaultAction": false, - "priority": 80, - "config": {} - }, - { - "alias": "delete_credential", - "name": "Delete Credential", - "providerId": "delete_credential", - "enabled": true, - "defaultAction": false, - "priority": 100, - "config": {} - }, - { - "alias": "update_user_locale", - "name": "Update User Locale", - "providerId": "update_user_locale", - "enabled": true, - "defaultAction": false, - "priority": 1000, - "config": {} - } - ], - "browserFlow": "browser", - "registrationFlow": "registration", - "directGrantFlow": "direct grant", - "resetCredentialsFlow": "reset credentials", - "clientAuthenticationFlow": "clients", - "dockerAuthenticationFlow": "docker auth", - 
"firstBrokerLoginFlow": "first broker login", - "attributes": { - "cibaBackchannelTokenDeliveryMode": "poll", - "cibaExpiresIn": "120", - "cibaAuthRequestedUserHint": "login_hint", - "oauth2DeviceCodeLifespan": "600", - "clientOfflineSessionMaxLifespan": "0", - "oauth2DevicePollingInterval": "5", - "clientSessionIdleTimeout": "0", - "parRequestUriLifespan": "60", - "clientSessionMaxLifespan": "0", - "clientOfflineSessionIdleTimeout": "0", - "cibaInterval": "5", - "realmReusableOtpCode": "false" - }, - "keycloakVersion": "26.1.1", - "userManagedAccessAllowed": false, - "organizationsEnabled": false, - "verifiableCredentialsEnabled": false, - "adminPermissionsEnabled": false, - "clientProfiles": { - "profiles": [] - }, - "clientPolicies": { - "policies": [] - } -} diff --git a/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml b/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml deleted file mode 100755 index 11cc02fdf..000000000 --- a/applications/osmo/deploy/002a-setup/values/gpu-operator.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# GPU Operator Helm Values -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator -# https://docs.nebius.com/kubernetes/gpu/set-up - -operator: - defaultRuntime: containerd - -# Enable driver installation by GPU Operator -# Even though Nebius nodes may have pre-installed drivers, the GPU Operator -# needs to manage the driver lifecycle for proper integration with device-plugin, -# toolkit, and other components. 
-driver: - enabled: true - # Let GPU Operator choose the appropriate driver version - # version: auto-detected by operator - upgradePolicy: - autoUpgrade: false # Don't auto-upgrade to avoid conflicts - -toolkit: - enabled: true - -devicePlugin: - enabled: true - config: - default: "any" - -dcgm: - enabled: true - -dcgmExporter: - enabled: true - serviceMonitor: - enabled: true - -gfd: - enabled: true - -migManager: - enabled: false - -nodeStatusExporter: - enabled: true - -# Node selector for GPU operator pods -node-feature-discovery: - worker: - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - -# Tolerations for GPU workloads -daemonsets: - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule diff --git a/applications/osmo/deploy/002a-setup/values/grafana.yaml b/applications/osmo/deploy/002a-setup/values/grafana.yaml deleted file mode 100755 index ab8dd6b6b..000000000 --- a/applications/osmo/deploy/002a-setup/values/grafana.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# Grafana Helm Values (standalone) -# https://github.com/grafana/helm-charts/tree/main/charts/grafana - -# Note: Grafana is typically deployed as part of kube-prometheus-stack -# This file is for standalone Grafana deployment if needed - -replicas: 1 - -adminUser: admin -# adminPassword should be set via --set or secret - -persistence: - enabled: true - size: 10Gi - storageClassName: "" - -resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 512Mi - -# Datasources -datasources: - datasources.yaml: - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - url: http://prometheus-kube-prometheus-prometheus:9090 - access: proxy - isDefault: true - - name: Loki - type: loki - url: http://loki:3100 - access: proxy - -# Dashboard providers -dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - 
options: - path: /var/lib/grafana/dashboards/default - -# Sidecar for dashboards -sidecar: - dashboards: - enabled: true - label: grafana_dashboard - datasources: - enabled: true - label: grafana_datasource - -# Service -service: - type: ClusterIP - port: 80 - -# Ingress (disabled by default) -ingress: - enabled: false diff --git a/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml b/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml deleted file mode 100755 index 320c867db..000000000 --- a/applications/osmo/deploy/002a-setup/values/kai-scheduler.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# KAI Scheduler Helm Values -# GPU-aware scheduler for OSMO -# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/dependencies/dependencies.html - -global: - # Modify the node selectors and tolerations to match your cluster - nodeSelector: {} - tolerations: [] - -scheduler: - additionalArgs: - - --default-staleness-grace-period=-1s # Disable staleness eviction - - --update-pod-eviction-condition=true # Enable OSMO to read preemption conditions diff --git a/applications/osmo/deploy/002a-setup/values/loki.yaml b/applications/osmo/deploy/002a-setup/values/loki.yaml deleted file mode 100755 index f4c277a22..000000000 --- a/applications/osmo/deploy/002a-setup/values/loki.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Loki Stack Helm Values -# https://github.com/grafana/helm-charts/tree/main/charts/loki-stack - -loki: - enabled: true - - persistence: - enabled: true - size: 50Gi - - config: - auth_enabled: false - - server: - http_listen_port: 3100 - - ingester: - lifecycler: - ring: - kvstore: - store: inmemory - replication_factor: 1 - chunk_idle_period: 15m - chunk_retain_period: 30s - - schema_config: - configs: - - from: 2020-01-01 - store: boltdb-shipper - object_store: filesystem - schema: v11 - index: - prefix: index_ - period: 24h - - storage_config: - boltdb_shipper: - active_index_directory: /data/loki/boltdb-shipper-active - cache_location: 
/data/loki/boltdb-shipper-cache - shared_store: filesystem - filesystem: - directory: /data/loki/chunks - - limits_config: - enforce_metric_name: false - reject_old_samples: true - reject_old_samples_max_age: 168h - max_entries_limit_per_query: 5000 - - table_manager: - retention_deletes_enabled: true - retention_period: 168h # 7 days - - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 1Gi - -# Promtail is deployed separately -promtail: - enabled: false - -# Grafana is deployed via kube-prometheus-stack -grafana: - enabled: false diff --git a/applications/osmo/deploy/002a-setup/values/network-operator.yaml b/applications/osmo/deploy/002a-setup/values/network-operator.yaml deleted file mode 100755 index 146a9daca..000000000 --- a/applications/osmo/deploy/002a-setup/values/network-operator.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Network Operator Helm Values -# https://docs.nvidia.com/networking/display/cokan10/network+operator - -# Operator settings -operator: - nodeSelector: - node-role.kubernetes.io/control-plane: "" - tolerations: - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - -# RDMA shared device plugin (for InfiniBand) -rdmaSharedDevicePlugin: - deploy: true - resources: - - name: rdma_shared_device_a - vendors: [15b3] - deviceIDs: [101b, 101d, 1017, 1019] - ifNames: ["*"] - -# SR-IOV device plugin -sriovDevicePlugin: - deploy: false - -# NIC cluster policy -nicClusterPolicy: - deploy: true - - # RDMA - rdmaSharedDevicePlugin: - image: k8s-rdma-shared-dev-plugin - repository: ghcr.io/mellanox - version: sha-4f3eb55 - -# Secondary network -secondaryNetwork: - deploy: true - - # Multus CNI - multus: - deploy: true - image: multus-cni - repository: ghcr.io/k8snetworkplumbingwg - version: v3.9.3 - - # CNI plugins - cniPlugins: - deploy: true - image: plugins - repository: ghcr.io/k8snetworkplumbingwg - version: 
v1.3.0 - - # IPAM plugin - ipamPlugin: - deploy: true - image: whereabouts - repository: ghcr.io/k8snetworkplumbingwg - version: v0.6.2 diff --git a/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml b/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml deleted file mode 100755 index b4781ae21..000000000 --- a/applications/osmo/deploy/002a-setup/values/osmo-backend-operator.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# OSMO Backend Operator Values -# https://nvidia.github.io/OSMO/main/deployment_guide/install_backend/deploy_backend.html - -global: - # REQUIRED: OSMO image tag (e.g., 6.0.0) - osmoImageTag: "6.0.0" - - # REQUIRED: Your OSMO service URL - serviceUrl: "https://osmo.example.com" - - # Namespaces - agentNamespace: "osmo-operator" - backendNamespace: "osmo-workflows" - - # REQUIRED: Unique name for this backend - backendName: "nebius-backend" - - # Authentication - accountTokenSecret: "osmo-operator-token" - loginMethod: "token" - - # Resource configuration - services: - backendListener: - resources: - requests: - cpu: "1" - memory: "1Gi" - limits: - memory: "1Gi" - backendWorker: - resources: - requests: - cpu: "1" - memory: "1Gi" - limits: - memory: "1Gi" diff --git a/applications/osmo/deploy/002a-setup/values/prometheus.yaml b/applications/osmo/deploy/002a-setup/values/prometheus.yaml deleted file mode 100755 index 12cc634d9..000000000 --- a/applications/osmo/deploy/002a-setup/values/prometheus.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Prometheus Stack Helm Values -# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack - -# Prometheus -prometheus: - prometheusSpec: - # Some CRDs require this to be >= 60 - maximumStartupDurationSeconds: 60 - retention: 15d - - resources: - requests: - cpu: 500m - memory: 2Gi - limits: - cpu: 2000m - memory: 8Gi - - storageSpec: - volumeClaimTemplate: - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 50Gi - - # Service monitors - 
serviceMonitorSelectorNilUsesHelmValues: false - podMonitorSelectorNilUsesHelmValues: false - -# Grafana -grafana: - enabled: true - - adminUser: admin - # adminPassword is set via --set flag - - persistence: - enabled: true - size: 10Gi - - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 512Mi - - # Additional datasources - additionalDataSources: - - name: Loki - type: loki - url: http://loki:3100 - access: proxy - isDefault: false - - # Dashboards - dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default - - # GPU dashboard - dashboards: - default: - nvidia-dcgm: - gnetId: 12239 - revision: 2 - datasource: Prometheus - -# Alertmanager -alertmanager: - enabled: true - - alertmanagerSpec: - resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 200m - memory: 256Mi - -# Node exporter -nodeExporter: - enabled: true - -# Kube state metrics -kubeStateMetrics: - enabled: true - -# Prometheus operator -prometheusOperator: - resources: - requests: - cpu: 100m - memory: 128Mi - limits: - cpu: 500m - memory: 512Mi diff --git a/applications/osmo/deploy/002a-setup/values/promtail.yaml b/applications/osmo/deploy/002a-setup/values/promtail.yaml deleted file mode 100755 index 601d29e57..000000000 --- a/applications/osmo/deploy/002a-setup/values/promtail.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Promtail Helm Values -# https://github.com/grafana/helm-charts/tree/main/charts/promtail - -config: - clients: - - url: http://loki:3100/loki/api/v1/push - - snippets: - pipelineStages: - - cri: {} - - json: - expressions: - level: level - message: msg - - labels: - level: - - output: - source: message - -# Resources -resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 200m - memory: 256Mi - -# Tolerations to run on all nodes -tolerations: - - key: 
node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - -# Volume mounts (for containerd logs if needed) -# Note: The default chart already mounts /var/lib/docker and /var/log -# Only add extra volumes if you need additional paths -extraVolumes: [] -extraVolumeMounts: [] - -# Service monitor -serviceMonitor: - enabled: true diff --git a/applications/osmo/workflows/osmo/test_bucket_write.yaml b/applications/osmo/workflows/osmo/test_bucket_write.yaml new file mode 100755 index 000000000..05ff9456d --- /dev/null +++ b/applications/osmo/workflows/osmo/test_bucket_write.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Test workflow: writes test.txt and uploads it to the default dataset bucket. +# Use after configuring the Nebius bucket as the default (10-configure-dataset-bucket.sh). 
+# Submit: osmo workflow submit workflows/osmo/test_bucket_write.yaml + +workflow: + name: test-bucket-write4 + resources: + default: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: write-test-file + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "OSMO default bucket test at MEOW $(date -Iseconds)" > {{output}}/test.txt + echo "Wrote test.txt to task output (will be uploaded to default bucket)" + cat {{output}}/test.txt + echo "Spinning for 10 seconds before stopping..." + sleep 10 + echo "Done." + outputs: + - dataset: + name: datasetv004 \ No newline at end of file From 725d783488ab69f0b82de4bd563dcfbddd9830bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Fri, 20 Feb 2026 08:24:01 +0100 Subject: [PATCH 23/37] - refactor folder structure --- .../osmo/deploy/{ => example}/000-prerequisites/README.md | 0 .../osmo/deploy/{ => example}/000-prerequisites/install-tools.sh | 0 .../deploy/{ => example}/000-prerequisites/nebius-env-init.sh | 0 .../osmo/deploy/{ => example}/000-prerequisites/secrets-init.sh | 0 .../{ => example}/000-prerequisites/wireguard-client-setup.sh | 0 applications/osmo/deploy/{ => example}/001-iac/README.md | 0 applications/osmo/deploy/{ => example}/001-iac/locals.tf | 0 applications/osmo/deploy/{ => example}/001-iac/main.tf | 0 .../osmo/deploy/{ => example}/001-iac/modules/k8s/main.tf | 0 .../osmo/deploy/{ => example}/001-iac/modules/k8s/outputs.tf | 0 .../{ => example}/001-iac/modules/k8s/templates/cloud-init.yaml | 0 .../osmo/deploy/{ => example}/001-iac/modules/k8s/variables.tf | 0 .../osmo/deploy/{ => example}/001-iac/modules/k8s/versions.tf | 0 .../osmo/deploy/{ => example}/001-iac/modules/platform/main.tf | 0 .../osmo/deploy/{ => example}/001-iac/modules/platform/outputs.tf | 0 .../deploy/{ => example}/001-iac/modules/platform/variables.tf | 0 .../deploy/{ => example}/001-iac/modules/platform/versions.tf | 0 .../osmo/deploy/{ => 
example}/001-iac/modules/wireguard/main.tf | 0 .../deploy/{ => example}/001-iac/modules/wireguard/outputs.tf | 0 .../001-iac/modules/wireguard/templates/cloud-init.yaml | 0 .../deploy/{ => example}/001-iac/modules/wireguard/variables.tf | 0 .../deploy/{ => example}/001-iac/modules/wireguard/versions.tf | 0 applications/osmo/deploy/{ => example}/001-iac/outputs.tf | 0 .../001-iac/terraform.tfvars.cost-optimized-secure.example | 0 .../{ => example}/001-iac/terraform.tfvars.cost-optimized.example | 0 .../osmo/deploy/{ => example}/001-iac/terraform.tfvars.example | 0 .../{ => example}/001-iac/terraform.tfvars.production.example | 0 .../deploy/{ => example}/001-iac/terraform.tfvars.secure.example | 0 applications/osmo/deploy/{ => example}/001-iac/variables.tf | 0 applications/osmo/deploy/{ => example}/001-iac/versions.tf | 0 .../{ => example}/002-setup/01-deploy-gpu-infrastructure.sh | 0 .../deploy/{ => example}/002-setup/02-deploy-observability.sh | 0 .../deploy/{ => example}/002-setup/03-deploy-nginx-ingress.sh | 0 .../{ => example}/002-setup/04-deploy-osmo-control-plane.sh | 0 applications/osmo/deploy/{ => example}/002-setup/04-enable-tls.sh | 0 .../osmo/deploy/{ => example}/002-setup/05-deploy-osmo-backend.sh | 0 .../osmo/deploy/{ => example}/002-setup/06-configure-storage.sh | 0 .../deploy/{ => example}/002-setup/07-configure-service-url.sh | 0 .../deploy/{ => example}/002-setup/08-configure-gpu-platform.sh | 0 .../{ => example}/002-setup/09-configure-backend-scheduler.sh | 0 .../deploy/{ => example}/002-setup/10-configure-dataset-bucket.sh | 0 applications/osmo/deploy/{ => example}/002-setup/README.md | 0 .../002-setup/cleanup/uninstall-gpu-infrastructure.sh | 0 .../deploy/{ => example}/002-setup/cleanup/uninstall-keycloak.sh | 0 .../{ => example}/002-setup/cleanup/uninstall-nginx-ingress.sh | 0 .../{ => example}/002-setup/cleanup/uninstall-observability.sh | 0 .../{ => example}/002-setup/cleanup/uninstall-osmo-backend.sh | 0 
.../002-setup/cleanup/uninstall-osmo-control-plane.sh | 0 applications/osmo/deploy/{ => example}/002-setup/defaults.sh | 0 .../osmo/deploy/{ => example}/002-setup/gpu_platform_update.json | 0 .../osmo/deploy/{ => example}/002-setup/gpu_pod_template.json | 0 applications/osmo/deploy/{ => example}/002-setup/lib/common.sh | 0 .../osmo/deploy/{ => example}/002-setup/osmo-values-noauth.yaml | 0 .../osmo/deploy/{ => example}/002-setup/sample_osmo_realm.json | 0 .../osmo/deploy/{ => example}/002-setup/values/gpu-operator.yaml | 0 .../osmo/deploy/{ => example}/002-setup/values/grafana.yaml | 0 .../osmo/deploy/{ => example}/002-setup/values/kai-scheduler.yaml | 0 applications/osmo/deploy/{ => example}/002-setup/values/loki.yaml | 0 .../deploy/{ => example}/002-setup/values/network-operator.yaml | 0 .../{ => example}/002-setup/values/osmo-backend-operator.yaml | 0 .../osmo/deploy/{ => example}/002-setup/values/prometheus.yaml | 0 .../osmo/deploy/{ => example}/002-setup/values/promtail.yaml | 0 applications/osmo/deploy/{ => example}/README.md | 0 63 files changed, 0 insertions(+), 0 deletions(-) rename applications/osmo/deploy/{ => example}/000-prerequisites/README.md (100%) rename applications/osmo/deploy/{ => example}/000-prerequisites/install-tools.sh (100%) rename applications/osmo/deploy/{ => example}/000-prerequisites/nebius-env-init.sh (100%) rename applications/osmo/deploy/{ => example}/000-prerequisites/secrets-init.sh (100%) rename applications/osmo/deploy/{ => example}/000-prerequisites/wireguard-client-setup.sh (100%) rename applications/osmo/deploy/{ => example}/001-iac/README.md (100%) rename applications/osmo/deploy/{ => example}/001-iac/locals.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/main.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/k8s/main.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/k8s/outputs.tf (100%) rename applications/osmo/deploy/{ => 
example}/001-iac/modules/k8s/templates/cloud-init.yaml (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/k8s/variables.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/k8s/versions.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/platform/main.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/platform/outputs.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/platform/variables.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/platform/versions.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/wireguard/main.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/wireguard/outputs.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/wireguard/templates/cloud-init.yaml (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/wireguard/variables.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/modules/wireguard/versions.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/outputs.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/terraform.tfvars.cost-optimized-secure.example (100%) rename applications/osmo/deploy/{ => example}/001-iac/terraform.tfvars.cost-optimized.example (100%) rename applications/osmo/deploy/{ => example}/001-iac/terraform.tfvars.example (100%) rename applications/osmo/deploy/{ => example}/001-iac/terraform.tfvars.production.example (100%) rename applications/osmo/deploy/{ => example}/001-iac/terraform.tfvars.secure.example (100%) rename applications/osmo/deploy/{ => example}/001-iac/variables.tf (100%) rename applications/osmo/deploy/{ => example}/001-iac/versions.tf (100%) rename applications/osmo/deploy/{ => example}/002-setup/01-deploy-gpu-infrastructure.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/02-deploy-observability.sh (100%) rename applications/osmo/deploy/{ => 
example}/002-setup/03-deploy-nginx-ingress.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/04-deploy-osmo-control-plane.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/04-enable-tls.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/05-deploy-osmo-backend.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/06-configure-storage.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/07-configure-service-url.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/08-configure-gpu-platform.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/09-configure-backend-scheduler.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/10-configure-dataset-bucket.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/README.md (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-gpu-infrastructure.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-keycloak.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-nginx-ingress.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-observability.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-osmo-backend.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/cleanup/uninstall-osmo-control-plane.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/defaults.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/gpu_platform_update.json (100%) rename applications/osmo/deploy/{ => example}/002-setup/gpu_pod_template.json (100%) rename applications/osmo/deploy/{ => example}/002-setup/lib/common.sh (100%) rename applications/osmo/deploy/{ => example}/002-setup/osmo-values-noauth.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/sample_osmo_realm.json (100%) rename applications/osmo/deploy/{ => 
example}/002-setup/values/gpu-operator.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/grafana.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/kai-scheduler.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/loki.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/network-operator.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/osmo-backend-operator.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/prometheus.yaml (100%) rename applications/osmo/deploy/{ => example}/002-setup/values/promtail.yaml (100%) rename applications/osmo/deploy/{ => example}/README.md (100%) diff --git a/applications/osmo/deploy/000-prerequisites/README.md b/applications/osmo/deploy/example/000-prerequisites/README.md similarity index 100% rename from applications/osmo/deploy/000-prerequisites/README.md rename to applications/osmo/deploy/example/000-prerequisites/README.md diff --git a/applications/osmo/deploy/000-prerequisites/install-tools.sh b/applications/osmo/deploy/example/000-prerequisites/install-tools.sh similarity index 100% rename from applications/osmo/deploy/000-prerequisites/install-tools.sh rename to applications/osmo/deploy/example/000-prerequisites/install-tools.sh diff --git a/applications/osmo/deploy/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh similarity index 100% rename from applications/osmo/deploy/000-prerequisites/nebius-env-init.sh rename to applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh diff --git a/applications/osmo/deploy/000-prerequisites/secrets-init.sh b/applications/osmo/deploy/example/000-prerequisites/secrets-init.sh similarity index 100% rename from applications/osmo/deploy/000-prerequisites/secrets-init.sh rename to applications/osmo/deploy/example/000-prerequisites/secrets-init.sh diff --git 
a/applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh b/applications/osmo/deploy/example/000-prerequisites/wireguard-client-setup.sh similarity index 100% rename from applications/osmo/deploy/000-prerequisites/wireguard-client-setup.sh rename to applications/osmo/deploy/example/000-prerequisites/wireguard-client-setup.sh diff --git a/applications/osmo/deploy/001-iac/README.md b/applications/osmo/deploy/example/001-iac/README.md similarity index 100% rename from applications/osmo/deploy/001-iac/README.md rename to applications/osmo/deploy/example/001-iac/README.md diff --git a/applications/osmo/deploy/001-iac/locals.tf b/applications/osmo/deploy/example/001-iac/locals.tf similarity index 100% rename from applications/osmo/deploy/001-iac/locals.tf rename to applications/osmo/deploy/example/001-iac/locals.tf diff --git a/applications/osmo/deploy/001-iac/main.tf b/applications/osmo/deploy/example/001-iac/main.tf similarity index 100% rename from applications/osmo/deploy/001-iac/main.tf rename to applications/osmo/deploy/example/001-iac/main.tf diff --git a/applications/osmo/deploy/001-iac/modules/k8s/main.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/main.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/k8s/main.tf rename to applications/osmo/deploy/example/001-iac/modules/k8s/main.tf diff --git a/applications/osmo/deploy/001-iac/modules/k8s/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/outputs.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/k8s/outputs.tf rename to applications/osmo/deploy/example/001-iac/modules/k8s/outputs.tf diff --git a/applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml b/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml similarity index 100% rename from applications/osmo/deploy/001-iac/modules/k8s/templates/cloud-init.yaml rename to 
applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml diff --git a/applications/osmo/deploy/001-iac/modules/k8s/variables.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/k8s/variables.tf rename to applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf diff --git a/applications/osmo/deploy/001-iac/modules/k8s/versions.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/versions.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/k8s/versions.tf rename to applications/osmo/deploy/example/001-iac/modules/k8s/versions.tf diff --git a/applications/osmo/deploy/001-iac/modules/platform/main.tf b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/platform/main.tf rename to applications/osmo/deploy/example/001-iac/modules/platform/main.tf diff --git a/applications/osmo/deploy/001-iac/modules/platform/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/platform/outputs.tf rename to applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf diff --git a/applications/osmo/deploy/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/platform/variables.tf rename to applications/osmo/deploy/example/001-iac/modules/platform/variables.tf diff --git a/applications/osmo/deploy/001-iac/modules/platform/versions.tf b/applications/osmo/deploy/example/001-iac/modules/platform/versions.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/platform/versions.tf rename to applications/osmo/deploy/example/001-iac/modules/platform/versions.tf diff --git 
a/applications/osmo/deploy/001-iac/modules/wireguard/main.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/main.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/wireguard/main.tf rename to applications/osmo/deploy/example/001-iac/modules/wireguard/main.tf diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/outputs.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/wireguard/outputs.tf rename to applications/osmo/deploy/example/001-iac/modules/wireguard/outputs.tf diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml b/applications/osmo/deploy/example/001-iac/modules/wireguard/templates/cloud-init.yaml similarity index 100% rename from applications/osmo/deploy/001-iac/modules/wireguard/templates/cloud-init.yaml rename to applications/osmo/deploy/example/001-iac/modules/wireguard/templates/cloud-init.yaml diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/variables.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/wireguard/variables.tf rename to applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf diff --git a/applications/osmo/deploy/001-iac/modules/wireguard/versions.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/versions.tf similarity index 100% rename from applications/osmo/deploy/001-iac/modules/wireguard/versions.tf rename to applications/osmo/deploy/example/001-iac/modules/wireguard/versions.tf diff --git a/applications/osmo/deploy/001-iac/outputs.tf b/applications/osmo/deploy/example/001-iac/outputs.tf similarity index 100% rename from applications/osmo/deploy/001-iac/outputs.tf rename to applications/osmo/deploy/example/001-iac/outputs.tf diff --git 
a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example similarity index 100% rename from applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized-secure.example rename to applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example similarity index 100% rename from applications/osmo/deploy/001-iac/terraform.tfvars.cost-optimized.example rename to applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.example similarity index 100% rename from applications/osmo/deploy/001-iac/terraform.tfvars.example rename to applications/osmo/deploy/example/001-iac/terraform.tfvars.example diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example similarity index 100% rename from applications/osmo/deploy/001-iac/terraform.tfvars.production.example rename to applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example diff --git a/applications/osmo/deploy/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example similarity index 100% rename from applications/osmo/deploy/001-iac/terraform.tfvars.secure.example rename to applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example diff --git a/applications/osmo/deploy/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf similarity index 100% rename from applications/osmo/deploy/001-iac/variables.tf rename to 
applications/osmo/deploy/example/001-iac/variables.tf diff --git a/applications/osmo/deploy/001-iac/versions.tf b/applications/osmo/deploy/example/001-iac/versions.tf similarity index 100% rename from applications/osmo/deploy/001-iac/versions.tf rename to applications/osmo/deploy/example/001-iac/versions.tf diff --git a/applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh similarity index 100% rename from applications/osmo/deploy/002-setup/01-deploy-gpu-infrastructure.sh rename to applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh diff --git a/applications/osmo/deploy/002-setup/02-deploy-observability.sh b/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh similarity index 100% rename from applications/osmo/deploy/002-setup/02-deploy-observability.sh rename to applications/osmo/deploy/example/002-setup/02-deploy-observability.sh diff --git a/applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh similarity index 100% rename from applications/osmo/deploy/002-setup/03-deploy-nginx-ingress.sh rename to applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh diff --git a/applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh similarity index 100% rename from applications/osmo/deploy/002-setup/04-deploy-osmo-control-plane.sh rename to applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh diff --git a/applications/osmo/deploy/002-setup/04-enable-tls.sh b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh similarity index 100% rename from applications/osmo/deploy/002-setup/04-enable-tls.sh rename to applications/osmo/deploy/example/002-setup/04-enable-tls.sh diff --git a/applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh 
b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh similarity index 100% rename from applications/osmo/deploy/002-setup/05-deploy-osmo-backend.sh rename to applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh diff --git a/applications/osmo/deploy/002-setup/06-configure-storage.sh b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh similarity index 100% rename from applications/osmo/deploy/002-setup/06-configure-storage.sh rename to applications/osmo/deploy/example/002-setup/06-configure-storage.sh diff --git a/applications/osmo/deploy/002-setup/07-configure-service-url.sh b/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh similarity index 100% rename from applications/osmo/deploy/002-setup/07-configure-service-url.sh rename to applications/osmo/deploy/example/002-setup/07-configure-service-url.sh diff --git a/applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh similarity index 100% rename from applications/osmo/deploy/002-setup/08-configure-gpu-platform.sh rename to applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh diff --git a/applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh b/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh similarity index 100% rename from applications/osmo/deploy/002-setup/09-configure-backend-scheduler.sh rename to applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh diff --git a/applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh b/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh similarity index 100% rename from applications/osmo/deploy/002-setup/10-configure-dataset-bucket.sh rename to applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh diff --git a/applications/osmo/deploy/002-setup/README.md 
b/applications/osmo/deploy/example/002-setup/README.md similarity index 100% rename from applications/osmo/deploy/002-setup/README.md rename to applications/osmo/deploy/example/002-setup/README.md diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-gpu-infrastructure.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-keycloak.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-nginx-ingress.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-observability.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh diff --git a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-backend.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh diff --git 
a/applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh similarity index 100% rename from applications/osmo/deploy/002-setup/cleanup/uninstall-osmo-control-plane.sh rename to applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh diff --git a/applications/osmo/deploy/002-setup/defaults.sh b/applications/osmo/deploy/example/002-setup/defaults.sh similarity index 100% rename from applications/osmo/deploy/002-setup/defaults.sh rename to applications/osmo/deploy/example/002-setup/defaults.sh diff --git a/applications/osmo/deploy/002-setup/gpu_platform_update.json b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json similarity index 100% rename from applications/osmo/deploy/002-setup/gpu_platform_update.json rename to applications/osmo/deploy/example/002-setup/gpu_platform_update.json diff --git a/applications/osmo/deploy/002-setup/gpu_pod_template.json b/applications/osmo/deploy/example/002-setup/gpu_pod_template.json similarity index 100% rename from applications/osmo/deploy/002-setup/gpu_pod_template.json rename to applications/osmo/deploy/example/002-setup/gpu_pod_template.json diff --git a/applications/osmo/deploy/002-setup/lib/common.sh b/applications/osmo/deploy/example/002-setup/lib/common.sh similarity index 100% rename from applications/osmo/deploy/002-setup/lib/common.sh rename to applications/osmo/deploy/example/002-setup/lib/common.sh diff --git a/applications/osmo/deploy/002-setup/osmo-values-noauth.yaml b/applications/osmo/deploy/example/002-setup/osmo-values-noauth.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/osmo-values-noauth.yaml rename to applications/osmo/deploy/example/002-setup/osmo-values-noauth.yaml diff --git a/applications/osmo/deploy/002-setup/sample_osmo_realm.json b/applications/osmo/deploy/example/002-setup/sample_osmo_realm.json similarity index 100% rename from 
applications/osmo/deploy/002-setup/sample_osmo_realm.json rename to applications/osmo/deploy/example/002-setup/sample_osmo_realm.json diff --git a/applications/osmo/deploy/002-setup/values/gpu-operator.yaml b/applications/osmo/deploy/example/002-setup/values/gpu-operator.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/gpu-operator.yaml rename to applications/osmo/deploy/example/002-setup/values/gpu-operator.yaml diff --git a/applications/osmo/deploy/002-setup/values/grafana.yaml b/applications/osmo/deploy/example/002-setup/values/grafana.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/grafana.yaml rename to applications/osmo/deploy/example/002-setup/values/grafana.yaml diff --git a/applications/osmo/deploy/002-setup/values/kai-scheduler.yaml b/applications/osmo/deploy/example/002-setup/values/kai-scheduler.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/kai-scheduler.yaml rename to applications/osmo/deploy/example/002-setup/values/kai-scheduler.yaml diff --git a/applications/osmo/deploy/002-setup/values/loki.yaml b/applications/osmo/deploy/example/002-setup/values/loki.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/loki.yaml rename to applications/osmo/deploy/example/002-setup/values/loki.yaml diff --git a/applications/osmo/deploy/002-setup/values/network-operator.yaml b/applications/osmo/deploy/example/002-setup/values/network-operator.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/network-operator.yaml rename to applications/osmo/deploy/example/002-setup/values/network-operator.yaml diff --git a/applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml b/applications/osmo/deploy/example/002-setup/values/osmo-backend-operator.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/osmo-backend-operator.yaml rename to 
applications/osmo/deploy/example/002-setup/values/osmo-backend-operator.yaml diff --git a/applications/osmo/deploy/002-setup/values/prometheus.yaml b/applications/osmo/deploy/example/002-setup/values/prometheus.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/prometheus.yaml rename to applications/osmo/deploy/example/002-setup/values/prometheus.yaml diff --git a/applications/osmo/deploy/002-setup/values/promtail.yaml b/applications/osmo/deploy/example/002-setup/values/promtail.yaml similarity index 100% rename from applications/osmo/deploy/002-setup/values/promtail.yaml rename to applications/osmo/deploy/example/002-setup/values/promtail.yaml diff --git a/applications/osmo/deploy/README.md b/applications/osmo/deploy/example/README.md similarity index 100% rename from applications/osmo/deploy/README.md rename to applications/osmo/deploy/example/README.md From 538e809aa85ab44637ac477433ca1d073a75cfd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Fri, 20 Feb 2026 08:35:38 +0100 Subject: [PATCH 24/37] - add defaults to env init --- .../000-prerequisites/nebius-env-init.sh | 365 +++--------------- 1 file changed, 50 insertions(+), 315 deletions(-) diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh index a9ad6a2e5..c64f6976a 100755 --- a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -1,19 +1,28 @@ #!/bin/bash # # Nebius Environment Initialization Script -# +# # This script sets up environment variables needed for Terraform deployment. # Run with: source ./nebius-env-init.sh # +# Configure your deployment by setting the values below. 
+# # NOTE: Do NOT use 'set -e' as this script is meant to be sourced # +# ======================================== +# CONFIGURATION - Set your values here +# ======================================== +NEBIUS_TENANT_ID="${NEBIUS_TENANT_ID:-}" # e.g. tenant-abc123def456 +NEBIUS_PROJECT_ID="${NEBIUS_PROJECT_ID:-}" # e.g. project-abc123def456 +NEBIUS_REGION="${NEBIUS_REGION:-eu-north1}" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 +# ======================================== + # Colors RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' -CYAN='\033[0;36m' NC='\033[0m' echo "" @@ -27,11 +36,6 @@ is_wsl() { grep -qi microsoft /proc/version 2>/dev/null } -# Check if jq is installed -has_jq() { - command -v jq &>/dev/null -} - # Get Nebius CLI path get_nebius_path() { if command -v nebius &>/dev/null; then @@ -51,13 +55,13 @@ check_nebius_cli() { echo "Or manually: curl -sSL https://storage.eu-north1.nebius.cloud/nebius/install.sh | bash" return 1 fi - + # Add to PATH if needed if ! command -v nebius &>/dev/null && [[ -x "$HOME/.nebius/bin/nebius" ]]; then export PATH="$HOME/.nebius/bin:$PATH" echo -e "${YELLOW}[INFO]${NC} Added ~/.nebius/bin to PATH" fi - + return 0 } @@ -67,13 +71,13 @@ check_nebius_auth() { if [[ -z "$nebius_path" ]]; then return 1 fi - + # Clear potentially corrupted token if [[ -n "$NEBIUS_IAM_TOKEN" ]]; then echo -e "${YELLOW}[INFO]${NC} Clearing NEBIUS_IAM_TOKEN environment variable" unset NEBIUS_IAM_TOKEN fi - + # Test authentication by listing profiles if "$nebius_path" profile list &>/dev/null; then return 0 @@ -81,171 +85,6 @@ check_nebius_auth() { return 1 } -# Read input with a prompt into a variable (bash/zsh compatible). -read_prompt_var() { - local prompt=$1 - local var_name=$2 - local default=$3 - local value="" - local read_from="/dev/tty" - local write_to="/dev/tty" - - if [[ ! -r "/dev/tty" || ! 
-w "/dev/tty" ]]; then - read_from="/dev/stdin" - write_to="/dev/stdout" - fi - - if [[ -n "$default" ]]; then - printf "%s [%s]: " "$prompt" "$default" >"$write_to" - else - printf "%s: " "$prompt" >"$write_to" - fi - - IFS= read -r value <"$read_from" - if [[ -z "$value" && -n "$default" ]]; then - value="$default" - fi - - eval "$var_name='$value'" -} - -# Interactive prompt with default value -prompt_with_default() { - local prompt=$1 - local default=$2 - local var_name=$3 - - read_prompt_var "$prompt" "$var_name" "$default" -} - -# List existing projects in a tenant -list_projects() { - local tenant_id=$1 - local nebius_path=$(get_nebius_path) - - echo -e "${CYAN}Fetching existing projects...${NC}" - local projects=$("$nebius_path" iam project list --parent-id "$tenant_id" --format json 2>/dev/null) - - if [[ -z "$projects" || "$projects" == "[]" ]]; then - echo " No projects found in this tenant." - return 1 - fi - - echo "" - echo "Existing projects:" - echo "$projects" | jq -r '.[] | " - \(.metadata.name) (\(.metadata.id))"' 2>/dev/null || echo " (Could not parse projects)" - echo "" - return 0 -} - -# Create a new project -create_project() { - local tenant_id=$1 - local project_name=$2 - local nebius_path=$(get_nebius_path) - - echo -e "${BLUE}Creating new project: $project_name${NC}" - - if "$nebius_path" iam project create --parent-id "$tenant_id" --name "$project_name" 2>&1; then - echo -e "${GREEN}[✓]${NC} Project created successfully" - - # Get the project ID - local project_id=$("$nebius_path" iam project get-by-name --parent-id "$tenant_id" --name "$project_name" --format json 2>/dev/null | jq -r '.metadata.id') - - if [[ -n "$project_id" && "$project_id" != "null" ]]; then - echo " Project ID: $project_id" - echo "$project_id" - return 0 - fi - fi - - echo -e "${RED}[ERROR]${NC} Failed to create project" - return 1 -} - -# Get project ID by name -get_project_id_by_name() { - local tenant_id=$1 - local project_name=$2 - local 
nebius_path=$(get_nebius_path) - - "$nebius_path" iam project get-by-name --parent-id "$tenant_id" --name "$project_name" --format json 2>/dev/null | jq -r '.metadata.id' -} - -# Interactive project selection/creation -select_or_create_project() { - local tenant_id=$1 - local nebius_path=$(get_nebius_path) - - echo "" - echo -e "${BLUE}Project Configuration${NC}" - echo "" - echo "Options:" - echo " 1) Use existing project (enter project ID)" - echo " 2) Create new project" - echo " 3) List existing projects first" - echo "" - - local choice - read_prompt_var "Choose option [1/2/3]" choice "" - - case $choice in - 1) - read_prompt_var "Enter Project ID" NEBIUS_PROJECT_ID "" - ;; - 2) - local project_name - read_prompt_var "Enter new project name" project_name "" - - if [[ -z "$project_name" ]]; then - echo -e "${RED}[ERROR]${NC} Project name cannot be empty" - return 1 - fi - - # Check if project already exists - local existing_id=$(get_project_id_by_name "$tenant_id" "$project_name") - if [[ -n "$existing_id" && "$existing_id" != "null" ]]; then - echo -e "${YELLOW}[INFO]${NC} Project '$project_name' already exists" - echo " Using existing project ID: $existing_id" - NEBIUS_PROJECT_ID="$existing_id" - else - NEBIUS_PROJECT_ID=$(create_project "$tenant_id" "$project_name") - if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then - return 1 - fi - fi - ;; - 3) - list_projects "$tenant_id" - echo "" - read_prompt_var "Enter Project ID from the list above (or 'new' to create)" input "" - - if [[ "$input" == "new" ]]; then - local project_name - read_prompt_var "Enter new project name" project_name "" - - if [[ -z "$project_name" ]]; then - echo -e "${RED}[ERROR]${NC} Project name cannot be empty" - return 1 - fi - - NEBIUS_PROJECT_ID=$(create_project "$tenant_id" "$project_name") - if [[ $? 
-ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then - return 1 - fi - else - NEBIUS_PROJECT_ID="$input" - fi - ;; - *) - echo -e "${RED}[ERROR]${NC} Invalid option" - return 1 - ;; - esac - - return 0 -} - # Main initialization main() { # Step 1: Check Nebius CLI @@ -255,7 +94,7 @@ main() { fi echo -e "${GREEN}[✓]${NC} Nebius CLI found" echo "" - + # Step 2: Check authentication echo -e "${BLUE}Step 2: Checking authentication${NC}" if ! check_nebius_auth; then @@ -279,172 +118,68 @@ main() { fi echo -e "${GREEN}[✓]${NC} Nebius CLI authenticated" echo "" - - # Step 3: Configure deployment settings - echo -e "${BLUE}Step 3: Configure deployment settings${NC}" - - local nebius_path=$(get_nebius_path) - - # Check for existing environment variables or use defaults - local current_tenant="${NEBIUS_TENANT_ID:-}" - local current_project="${NEBIUS_PROJECT_ID:-}" - local current_region="${NEBIUS_REGION:-eu-north1}" - - # Sanitize previously set values in case they were corrupted by a failed prompt - if [[ -n "$current_tenant" && ! "$current_tenant" =~ ^tenant-[a-z0-9]+$ ]]; then - current_tenant="" - fi - if [[ -n "$current_project" && ! "$current_project" =~ ^project-[a-z0-9]+$ ]]; then - current_project="" - fi - echo "" - - # Tenant ID - if [[ -z "$current_tenant" ]]; then - echo "Tenant ID is required. Find it in the Nebius Console under IAM > Tenants" - echo "" - read_prompt_var "List available tenants? (y/N)" list_tenants "" - if [[ "$list_tenants" =~ ^[yY]$ ]]; then - echo "" - echo "Fetching available tenants..." 
- local tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) - if [[ -n "$tenants" && "$tenants" != "[]" ]]; then - echo "" - echo "Available tenants:" - if has_jq; then - local page_token="" - local total_count=0 - local last_tenant_id="" - while :; do - if [[ -n "$page_token" ]]; then - tenants=$("$nebius_path" iam tenant list --format json --page-token "$page_token" 2>/dev/null) - else - tenants=$("$nebius_path" iam tenant list --format json 2>/dev/null) - fi - - echo "$tenants" | jq -r '.items // . | map(select(.metadata.name | startswith("billing-test") | not)) | .[] | " - \(.metadata.name): \(.metadata.id)"' 2>/dev/null || true - local page_count - page_count=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | length' 2>/dev/null || echo "0") - total_count=$((total_count + page_count)) - if [[ "$page_count" -gt 0 ]]; then - last_tenant_id=$(echo "$tenants" | jq -r '(.items // .) | map(select(.metadata.name | startswith("billing-test") | not)) | .[-1].metadata.id' 2>/dev/null) - fi - - page_token=$(echo "$tenants" | jq -r '.next_page_token // empty' 2>/dev/null) - if [[ -z "$page_token" ]]; then - break - fi - done - - # Auto-detect if only one tenant across all pages - if [[ "$total_count" == "1" ]]; then - current_tenant="$last_tenant_id" - echo -e "${GREEN}[✓]${NC} Auto-detected tenant: $current_tenant" - fi - else - echo " (jq not found; run 'brew install jq' to show tenants)" - fi - else - echo " No tenants found." - fi - echo "" - fi + # Step 3: Validate configuration + echo -e "${BLUE}Step 3: Validating configuration${NC}" - prompt_with_default "Enter Tenant ID" "$current_tenant" "NEBIUS_TENANT_ID" - else - prompt_with_default "Tenant ID" "$current_tenant" "NEBIUS_TENANT_ID" - fi - - # Validate tenant ID if [[ -z "$NEBIUS_TENANT_ID" ]]; then - echo -e "${RED}[ERROR]${NC} Tenant ID is required!" + echo -e "${RED}[ERROR]${NC} NEBIUS_TENANT_ID is not set." 
+ echo " Edit the CONFIGURATION section at the top of this script." return 1 fi - - # Project ID - with option to create - if [[ -z "$current_project" ]]; then - echo "" - echo "No project configured. You can use an existing project or create a new one." - if ! select_or_create_project "$NEBIUS_TENANT_ID"; then - return 1 - fi - else - echo "" - echo "Current project: $current_project" - read_prompt_var "Use this project? (Y/n/new)" use_current "" - - case $use_current in - n|N) - if ! select_or_create_project "$NEBIUS_TENANT_ID"; then - return 1 - fi - ;; - new) - local project_name - read_prompt_var "Enter new project name" project_name "" - NEBIUS_PROJECT_ID=$(create_project "$NEBIUS_TENANT_ID" "$project_name") - if [[ $? -ne 0 || -z "$NEBIUS_PROJECT_ID" ]]; then - return 1 - fi - ;; - *) - NEBIUS_PROJECT_ID="$current_project" - ;; - esac + + if [[ ! "$NEBIUS_TENANT_ID" =~ ^tenant-[a-z0-9]+ ]]; then + echo -e "${RED}[ERROR]${NC} Invalid tenant ID format: '$NEBIUS_TENANT_ID'" + echo " Tenant IDs should look like: tenant-e00abc123def456" + return 1 fi - - # Validate project ID format + if [[ -z "$NEBIUS_PROJECT_ID" ]]; then - echo -e "${RED}[ERROR]${NC} Project ID is required!" + echo -e "${RED}[ERROR]${NC} NEBIUS_PROJECT_ID is not set." + echo " Edit the CONFIGURATION section at the top of this script." return 1 fi - - # Check if project ID looks valid (should start with 'project-') + if [[ ! "$NEBIUS_PROJECT_ID" =~ ^project-[a-z0-9]+ ]]; then echo -e "${RED}[ERROR]${NC} Invalid project ID format: '$NEBIUS_PROJECT_ID'" echo " Project IDs should look like: project-e00abc123def456" - echo "" - echo " Run this to list your projects:" - echo " nebius iam project list --parent-id $NEBIUS_TENANT_ID" return 1 fi - - # Region + + if [[ -z "$NEBIUS_REGION" ]]; then + echo -e "${RED}[ERROR]${NC} NEBIUS_REGION is not set." + echo " Edit the CONFIGURATION section at the top of this script." 
+ return 1 + fi + + echo -e "${GREEN}[✓]${NC} Configuration valid" echo "" - echo "Available regions:" - echo " - eu-north1 (Finland - H100, H200, L40S)" - echo " - eu-north2 (H200)" - echo " - eu-west1 (H200)" - echo " - me-west1 (B200)" - echo " - uk-south1 (B300)" - echo " - us-central1 (H200, B200)" - prompt_with_default "Region" "${current_region:-eu-north1}" "NEBIUS_REGION" - + # Step 4: Export environment variables - echo "" echo -e "${BLUE}Step 4: Setting environment variables${NC}" - + + local nebius_path=$(get_nebius_path) + export NEBIUS_TENANT_ID export NEBIUS_PROJECT_ID export NEBIUS_REGION - + # Get IAM token for Terraform provider authentication echo "Getting IAM token for Terraform..." unset NEBIUS_IAM_TOKEN # Clear any old/corrupted token export NEBIUS_IAM_TOKEN=$("$nebius_path" iam get-access-token) - + if [[ -z "$NEBIUS_IAM_TOKEN" ]]; then echo -e "${RED}[ERROR]${NC} Failed to get IAM token" return 1 fi echo -e "${GREEN}[✓]${NC} IAM token obtained" - + # Terraform variables export TF_VAR_tenant_id="$NEBIUS_TENANT_ID" export TF_VAR_parent_id="$NEBIUS_PROJECT_ID" export TF_VAR_region="$NEBIUS_REGION" - + echo "" echo -e "${GREEN}[✓]${NC} Environment variables set:" echo " NEBIUS_TENANT_ID = $NEBIUS_TENANT_ID" @@ -454,17 +189,17 @@ main() { echo " TF_VAR_tenant_id = $TF_VAR_tenant_id" echo " TF_VAR_parent_id = $TF_VAR_parent_id" echo " TF_VAR_region = $TF_VAR_region" - + # Step 5: Verify connectivity echo "" echo -e "${BLUE}Step 5: Verifying connectivity${NC}" - + if "$nebius_path" iam project get --id "$NEBIUS_PROJECT_ID" &>/dev/null; then echo -e "${GREEN}[✓]${NC} Successfully connected to Nebius project" else echo -e "${YELLOW}[!]${NC} Could not verify project access (this may be normal for new projects)" fi - + echo "" echo "========================================" echo -e "${GREEN}Environment initialization complete!${NC}" @@ -476,9 +211,9 @@ main() { echo " 3. cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars" echo " 4. 
terraform init && terraform apply" echo "" - + return 0 } # Run main function -main +main \ No newline at end of file From 4f411e4ae1f5d8ee48600b811f9ffb91e3f01daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Fri, 20 Feb 2026 08:47:36 +0100 Subject: [PATCH 25/37] - add reservations --- applications/osmo/deploy/example/001-iac/main.tf | 1 + .../osmo/deploy/example/001-iac/modules/k8s/main.tf | 6 ++++++ .../osmo/deploy/example/001-iac/modules/k8s/variables.tf | 6 ++++++ applications/osmo/deploy/example/001-iac/variables.tf | 6 ++++++ 4 files changed, 19 insertions(+) diff --git a/applications/osmo/deploy/example/001-iac/main.tf b/applications/osmo/deploy/example/001-iac/main.tf index ae471c25d..a55029b20 100755 --- a/applications/osmo/deploy/example/001-iac/main.tf +++ b/applications/osmo/deploy/example/001-iac/main.tf @@ -91,6 +91,7 @@ module "k8s" { gpu_nodes_preemptible = var.gpu_nodes_preemptible gpu_nodes_driverfull_image = var.gpu_nodes_driverfull_image gpu_drivers_preset = local.gpu_drivers_preset + gpu_reservation_ids = var.gpu_reservation_ids # Filestore enable_filestore = var.enable_filestore diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/main.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/main.tf index e913c0d87..6ebf374d9 100755 --- a/applications/osmo/deploy/example/001-iac/modules/k8s/main.tf +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/main.tf @@ -145,6 +145,12 @@ resource "nebius_mk8s_v1_node_group" "gpu" { # Driverfull images (pre-installed NVIDIA drivers, no GPU Operator driver needed) gpu_settings = var.gpu_nodes_driverfull_image ? { drivers_preset = var.gpu_drivers_preset } : null + # Reservation policy for capacity block groups + reservation_policy = length(var.gpu_reservation_ids) > 0 ? { + policy = "STRICT" + reservation_ids = var.gpu_reservation_ids + } : null + # Preemptible configuration preemptible = var.gpu_nodes_preemptible ? 
{ on_preemption = "STOP" diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf b/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf index b2662db58..9e10cf9d1 100755 --- a/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/variables.tf @@ -175,6 +175,12 @@ variable "gpu_nodes_preemptible" { default = false } +variable "gpu_reservation_ids" { + description = "List of capacity block group IDs for GPU reservations" + type = list(string) + default = [] +} + variable "gpu_nodes_driverfull_image" { description = "Use Nebius driverfull images with pre-installed NVIDIA drivers" type = bool diff --git a/applications/osmo/deploy/example/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf index 39db0992a..3c7548582 100755 --- a/applications/osmo/deploy/example/001-iac/variables.tf +++ b/applications/osmo/deploy/example/001-iac/variables.tf @@ -221,6 +221,12 @@ variable "gpu_nodes_preemptible" { default = false } +variable "gpu_reservation_ids" { + description = "List of capacity block group IDs for GPU reservations (e.g. [\"capacityblockgroup-e00xxxxx\"]). When set, reservation_policy is STRICT." + type = list(string) + default = [] +} + variable "gpu_nodes_driverfull_image" { description = "Use Nebius driverfull images (pre-installed NVIDIA drivers). When true, GPU Operator driver installation is not needed." 
type = bool From 69e0ec8316625e267f8d346cb32a389b490a4ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Fri, 20 Feb 2026 10:12:45 +0100 Subject: [PATCH 26/37] - fix networking --- .../000-prerequisites/nebius-env-init.sh | 65 ++++++++++++++++--- .../osmo/deploy/example/001-iac/main.tf | 10 +-- .../example/001-iac/modules/platform/main.tf | 20 +----- .../001-iac/modules/platform/outputs.tf | 4 +- .../001-iac/modules/platform/variables.tf | 12 ++-- .../001-iac/modules/wireguard/variables.tf | 3 +- ...aform.tfvars.cost-optimized-secure.example | 6 -- .../terraform.tfvars.cost-optimized.example | 6 -- .../terraform.tfvars.production.example | 6 -- .../001-iac/terraform.tfvars.secure.example | 6 -- .../osmo/deploy/example/001-iac/variables.tf | 16 ++--- 11 files changed, 83 insertions(+), 71 deletions(-) diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh index c64f6976a..cd52bc201 100755 --- a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -180,19 +180,53 @@ main() { export TF_VAR_parent_id="$NEBIUS_PROJECT_ID" export TF_VAR_region="$NEBIUS_REGION" + echo -e "${GREEN}[✓]${NC} Core environment variables set" + + # Step 5: Discover default network and subnet echo "" - echo -e "${GREEN}[✓]${NC} Environment variables set:" - echo " NEBIUS_TENANT_ID = $NEBIUS_TENANT_ID" - echo " NEBIUS_PROJECT_ID = $NEBIUS_PROJECT_ID" - echo " NEBIUS_REGION = $NEBIUS_REGION" - echo " NEBIUS_IAM_TOKEN = ${NEBIUS_IAM_TOKEN:0:20}... 
(truncated)" - echo " TF_VAR_tenant_id = $TF_VAR_tenant_id" - echo " TF_VAR_parent_id = $TF_VAR_parent_id" - echo " TF_VAR_region = $TF_VAR_region" + echo -e "${BLUE}Step 5: Discovering default network and subnet${NC}" + + local network_json subnet_json + network_json=$("$nebius_path" vpc v1 network list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null) + + local network_id network_name subnet_id subnet_name + + if [[ -n "$network_json" ]]; then + network_id=$(echo "$network_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.id // empty' 2>/dev/null) + network_name=$(echo "$network_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.name // empty' 2>/dev/null) + fi + + if [[ -z "$network_id" ]]; then + echo -e "${RED}[ERROR]${NC} No default network found in project $NEBIUS_PROJECT_ID" + echo " Expected a network whose name starts with 'default'." + return 1 + fi + + echo -e "${GREEN}[✓]${NC} Found network: $network_name ($network_id)" + + subnet_json=$("$nebius_path" vpc v1 subnet list --parent-id "$NEBIUS_PROJECT_ID" --format json 2>/dev/null) - # Step 5: Verify connectivity + if [[ -n "$subnet_json" ]]; then + subnet_id=$(echo "$subnet_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.id // empty' 2>/dev/null) + subnet_name=$(echo "$subnet_json" | jq -r '(.items // .) | map(select(.metadata.name | startswith("default"))) | .[0].metadata.name // empty' 2>/dev/null) + fi + + if [[ -z "$subnet_id" ]]; then + echo -e "${RED}[ERROR]${NC} No default subnet found in project $NEBIUS_PROJECT_ID" + echo " Expected a subnet whose name starts with 'default'." 
+ return 1 + fi + + echo -e "${GREEN}[✓]${NC} Found subnet: $subnet_name ($subnet_id)" + + export NEBIUS_NETWORK_ID="$network_id" + export NEBIUS_SUBNET_ID="$subnet_id" + export TF_VAR_network_id="$network_id" + export TF_VAR_subnet_id="$subnet_id" + + # Step 6: Verify connectivity echo "" - echo -e "${BLUE}Step 5: Verifying connectivity${NC}" + echo -e "${BLUE}Step 6: Verifying connectivity${NC}" if "$nebius_path" iam project get --id "$NEBIUS_PROJECT_ID" &>/dev/null; then echo -e "${GREEN}[✓]${NC} Successfully connected to Nebius project" @@ -205,6 +239,17 @@ main() { echo -e "${GREEN}Environment initialization complete!${NC}" echo "========================================" echo "" + echo -e "${GREEN}[✓]${NC} Environment variables set:" + echo " NEBIUS_TENANT_ID = $NEBIUS_TENANT_ID" + echo " NEBIUS_PROJECT_ID = $NEBIUS_PROJECT_ID" + echo " NEBIUS_REGION = $NEBIUS_REGION" + echo " NEBIUS_IAM_TOKEN = ${NEBIUS_IAM_TOKEN:0:20}... (truncated)" + echo " NEBIUS_NETWORK_ID = $NEBIUS_NETWORK_ID" + echo " NEBIUS_SUBNET_ID = $NEBIUS_SUBNET_ID" + echo "" + echo " Network: $network_name ($network_id)" + echo " Subnet: $subnet_name ($subnet_id)" + echo "" echo "Next steps:" echo " 1. source ./secrets-init.sh # Initialize MysteryBox secrets (recommended)" echo " 2. 
cd ../001-iac" diff --git a/applications/osmo/deploy/example/001-iac/main.tf b/applications/osmo/deploy/example/001-iac/main.tf index a55029b20..9e066ebec 100755 --- a/applications/osmo/deploy/example/001-iac/main.tf +++ b/applications/osmo/deploy/example/001-iac/main.tf @@ -13,8 +13,9 @@ module "platform" { region = var.region name_prefix = local.name_prefix - # Network - vpc_cidr = var.vpc_cidr + # Network (existing default network and subnet) + network_id = var.network_id + subnet_id = var.subnet_id # Storage storage_bucket_name = local.storage_bucket_name @@ -58,7 +59,7 @@ module "k8s" { name_prefix = local.name_prefix # Network - subnet_id = module.platform.subnet_id + subnet_id = var.subnet_id # Cluster config k8s_version = var.k8s_version @@ -116,8 +117,7 @@ module "wireguard" { name_prefix = local.name_prefix # Network - subnet_id = module.platform.subnet_id - vpc_cidr = var.vpc_cidr + subnet_id = var.subnet_id wg_network = var.wireguard_network # Instance config diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/main.tf b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf index 0b75c3af9..10c4758bc 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/main.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf @@ -3,23 +3,9 @@ # ============================================================================= # ----------------------------------------------------------------------------- -# VPC Network +# VPC Network (uses existing default network and subnet from the project) +# Set via nebius-env-init.sh -> TF_VAR_network_id / TF_VAR_subnet_id # ----------------------------------------------------------------------------- -resource "nebius_vpc_v1_network" "main" { - parent_id = var.parent_id - name = "${var.name_prefix}-network" -} - -resource "nebius_vpc_v1_subnet" "main" { - parent_id = var.parent_id - name = "${var.name_prefix}-subnet" - network_id = nebius_vpc_v1_network.main.id - - # 
Use network's default pools - more reliable across regions - ipv4_private_pools = { - use_network_pools = true - } -} # ----------------------------------------------------------------------------- # Service Account for Storage @@ -166,7 +152,7 @@ resource "nebius_msp_postgresql_v1alpha1_cluster" "main" { count = var.enable_managed_postgresql ? 1 : 0 parent_id = var.parent_id name = "${var.name_prefix}-postgresql" - network_id = nebius_vpc_v1_network.main.id + network_id = var.network_id config = { version = var.postgresql_version diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf index 0d54886bc..47225611f 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf @@ -7,12 +7,12 @@ # ----------------------------------------------------------------------------- output "network_id" { description = "VPC network ID" - value = nebius_vpc_v1_network.main.id + value = var.network_id } output "subnet_id" { description = "VPC subnet ID" - value = nebius_vpc_v1_subnet.main.id + value = var.subnet_id } # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf index 444aea81d..5d12973f1 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf @@ -23,13 +23,17 @@ variable "name_prefix" { } # ----------------------------------------------------------------------------- -# Network Configuration +# Network Configuration (existing default network and subnet) # ----------------------------------------------------------------------------- -variable "vpc_cidr" { - description = "CIDR block for VPC subnet" 
+variable "network_id" { + description = "Existing VPC network ID (set by nebius-env-init.sh)" + type = string +} + +variable "subnet_id" { + description = "Existing VPC subnet ID (set by nebius-env-init.sh)" type = string - default = "10.0.0.0/16" } # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf b/applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf index 96d904227..018bf0df8 100755 --- a/applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf +++ b/applications/osmo/deploy/example/001-iac/modules/wireguard/variables.tf @@ -27,8 +27,9 @@ variable "subnet_id" { } variable "vpc_cidr" { - description = "VPC CIDR for routing" + description = "VPC CIDR for routing (unused, kept for future use)" type = string + default = "" } variable "wg_network" { diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example index 8d4c88f1f..d4eecdb5a 100755 --- a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized-secure.example @@ -25,12 +25,6 @@ region = "eu-north1" environment = "dev" project_name = "osmo" -# ----------------------------------------------------------------------------- -# Network Settings -# ----------------------------------------------------------------------------- -# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. 
-vpc_cidr = "10.0.0.0/20" - # ----------------------------------------------------------------------------- # Kubernetes Cluster (PRIVATE - access via WireGuard) # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example index ca8d54e5c..5da602bef 100755 --- a/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.cost-optimized.example @@ -18,12 +18,6 @@ region = "eu-north1" environment = "dev" project_name = "osmo-dev" -# ----------------------------------------------------------------------------- -# Network Settings -# ----------------------------------------------------------------------------- -# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. -vpc_cidr = "10.0.0.0/20" - # ----------------------------------------------------------------------------- # Kubernetes Cluster # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example index 778da4f99..ee61b2716 100755 --- a/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.production.example @@ -18,12 +18,6 @@ region = "eu-north1" environment = "prod" project_name = "osmo-prod" -# ----------------------------------------------------------------------------- -# Network Settings -# ----------------------------------------------------------------------------- -# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. 
-vpc_cidr = "10.0.0.0/20" - # ----------------------------------------------------------------------------- # Kubernetes Cluster # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example index f74be9ea1..00d9eaf29 100755 --- a/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.secure.example @@ -18,12 +18,6 @@ region = "eu-north1" environment = "staging" project_name = "osmo-secure" -# ----------------------------------------------------------------------------- -# Network Settings -# ----------------------------------------------------------------------------- -# Note: /16 may exhaust VPC pool capacity. Using /20 (4,096 addresses) instead. -vpc_cidr = "10.0.0.0/20" - # ----------------------------------------------------------------------------- # Kubernetes Cluster (PRIVATE ONLY) # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf index 3c7548582..6a16845e2 100755 --- a/applications/osmo/deploy/example/001-iac/variables.tf +++ b/applications/osmo/deploy/example/001-iac/variables.tf @@ -41,18 +41,18 @@ variable "project_name" { } # ============================================================================= -# Network Configuration +# Network Configuration (existing default network and subnet) +# Set automatically by nebius-env-init.sh via TF_VAR_network_id / TF_VAR_subnet_id # ============================================================================= -variable "vpc_cidr" { - description = "CIDR block for VPC subnet (/20 recommended - /16 may exhaust pool)" +variable "network_id" { + description = "Existing VPC network ID (set by 
nebius-env-init.sh)" type = string - default = "10.0.0.0/20" +} - validation { - condition = can(cidrhost(var.vpc_cidr, 0)) - error_message = "VPC CIDR must be a valid CIDR block" - } +variable "subnet_id" { + description = "Existing VPC subnet ID (set by nebius-env-init.sh)" + type = string } # ============================================================================= From aae176a2d10c47ea4314def8330863ec3ee60b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Fri, 20 Feb 2026 13:19:52 +0100 Subject: [PATCH 27/37] - zsh compability --- .../002-setup/01-deploy-gpu-infrastructure.sh | 2 +- .../002-setup/02-deploy-observability.sh | 2 +- .../002-setup/03-deploy-nginx-ingress.sh | 2 +- .../002-setup/04-deploy-osmo-control-plane.sh | 18 +- .../deploy/example/002-setup/04-enable-tls.sh | 231 ++++++++++++------ .../002-setup/05-deploy-osmo-backend.sh | 2 +- .../example/002-setup/06-configure-storage.sh | 2 +- .../002-setup/07-configure-service-url.sh | 2 +- .../002-setup/08-configure-gpu-platform.sh | 2 +- .../09-configure-backend-scheduler.sh | 2 +- .../002-setup/10-configure-dataset-bucket.sh | 6 +- .../cleanup/uninstall-gpu-infrastructure.sh | 2 +- .../002-setup/cleanup/uninstall-keycloak.sh | 2 +- .../cleanup/uninstall-nginx-ingress.sh | 2 +- .../cleanup/uninstall-observability.sh | 2 +- .../cleanup/uninstall-osmo-backend.sh | 2 +- .../cleanup/uninstall-osmo-control-plane.sh | 2 +- 17 files changed, 181 insertions(+), 102 deletions(-) diff --git a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh index ac6289b7d..7e5d19679 100755 --- a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh @@ -5,7 +5,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source 
"${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh b/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh index cee09bac5..c77761c90 100755 --- a/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh +++ b/applications/osmo/deploy/example/002-setup/02-deploy-observability.sh @@ -5,7 +5,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh index 5ecda68d3..87cfd5fde 100755 --- a/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh +++ b/applications/osmo/deploy/example/002-setup/03-deploy-nginx-ingress.sh @@ -14,7 +14,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh index 4ea8c6e71..2820377a7 100755 --- a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh @@ -138,11 +138,7 @@ else printf "Enter choice [1-${#VALID_REGIONS[@]}]: " read -r choice if [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#VALID_REGIONS[@]} )); then - NEBIUS_SELECTED_REGION="${VALID_REGIONS[$choice]}" - # bash arrays are 0-based, zsh arrays are 1-based; adjust if needed - if [[ -z "$NEBIUS_SELECTED_REGION" ]]; then - NEBIUS_SELECTED_REGION="${VALID_REGIONS[$((choice - 1))]}" - fi + 
NEBIUS_SELECTED_REGION="${VALID_REGIONS[$((choice - 1))]}" break fi echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}." @@ -1743,9 +1739,9 @@ cat > /tmp/vault-patch.json << 'PATCH_EOF' PATCH_EOF # All OSMO deployments that need the vault-secrets volume for MEK -OSMO_DEPLOYMENTS="osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router" +OSMO_DEPLOYMENTS=(osmo-service osmo-worker osmo-agent osmo-logger osmo-delayed-job-monitor osmo-router) -for deploy in $OSMO_DEPLOYMENTS; do +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then # Check if vault-secrets volume already exists EXISTING_VOL=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ @@ -1771,7 +1767,7 @@ rm -f /tmp/vault-patch.json # Wait for rollouts to complete log_info "Waiting for deployments to roll out with new configuration..." -for deploy in $OSMO_DEPLOYMENTS; do +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then kubectl rollout status deployment/$deploy -n "${OSMO_NAMESPACE}" --timeout=180s || \ log_warning " Timeout waiting for $deploy rollout" @@ -1792,9 +1788,9 @@ if [[ "$AUTH_ENABLED" == "true" ]]; then else log_info "Verifying service ports (Envoy disabled)..." - OSMO_SERVICES="osmo-service osmo-router osmo-logger osmo-agent" + OSMO_SERVICES=(osmo-service osmo-router osmo-logger osmo-agent) - for svc in $OSMO_SERVICES; do + for svc in "${OSMO_SERVICES[@]}"; do if kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" &>/dev/null; then CURRENT_TARGET=$(kubectl get svc "$svc" -n "${OSMO_NAMESPACE}" \ -o jsonpath='{.spec.ports[0].targetPort}' 2>/dev/null || echo "") @@ -1822,7 +1818,7 @@ log_info "Verifying deployment configuration..." 
# Verify vault-secrets volumes are mounted echo "" echo "Volume configuration verification:" -for deploy in $OSMO_DEPLOYMENTS; do +for deploy in "${OSMO_DEPLOYMENTS[@]}"; do if kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" &>/dev/null; then VOL_CHECK=$(kubectl get deployment/$deploy -n "${OSMO_NAMESPACE}" \ -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null | grep -w "vault-secrets" || echo "") diff --git a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh index 799811d0f..f1a569a27 100755 --- a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh @@ -295,7 +295,8 @@ if [[ "$TLS_MODE" == "certbot" ]]; then echo " Certbot will run once per domain. Each requires a separate DNS TXT record." echo "" fi - read -r -p " Press Enter to continue (or Ctrl-C to abort)..." + printf " Press Enter to continue (or Ctrl-C to abort)..." + read -r echo "" # Process each domain @@ -415,42 +416,88 @@ spec: EOF log_success "ClusterIssuer created" + # ------------------------------------------------------------------------- + # Clean up any previous failed certificate attempts + # ------------------------------------------------------------------------- + log_info "Cleaning up previous certificate attempts (if any)..." 
+ kubectl delete challenge --all -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete order --all -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete certificaterequest --all -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete certificate "${TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete secret "${TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + # Clean up any lingering solver pods from previous attempts + kubectl delete pods -n "${OSMO_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true + + # ------------------------------------------------------------------------- + # Helper: wait for a certificate to become ready + # ------------------------------------------------------------------------- + wait_for_certificate() { + local cert_name="$1" + local max_wait="${2:-300}" + local interval=5 + local elapsed=0 + + log_info "Waiting for certificate '${cert_name}' (up to ${max_wait}s)..." + while [[ $elapsed -lt $max_wait ]]; do + local status + status=$(kubectl get certificate "${cert_name}" -n "${OSMO_NS}" \ + -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") + if [[ "$status" == "True" ]]; then + log_success "Certificate '${cert_name}' issued and ready" + return 0 + fi + if (( elapsed > 0 && elapsed % 30 == 0 )); then + local challenge_state + challenge_state=$(kubectl get challenge -n "${OSMO_NS}" \ + -o jsonpath='{.items[0].status.state}' 2>/dev/null || echo "unknown") + log_info " Still waiting... 
(elapsed: ${elapsed}s, challenge: ${challenge_state})" + fi + sleep $interval + elapsed=$((elapsed + interval)) + done + + log_warning "Certificate '${cert_name}' not ready after ${max_wait}s" + kubectl describe certificate "${cert_name}" -n "${OSMO_NS}" 2>/dev/null | tail -15 + echo "" + echo "Debugging commands:" + echo " kubectl get certificate,certificaterequest,order,challenge -n ${OSMO_NS}" + echo " kubectl describe challenge -n ${OSMO_NS}" + return 1 + } + # ------------------------------------------------------------------------- # Issue TLS certificate for main domain + # + # When OSMO is already deployed (Mode B), the Envoy sidecar on OSMO + # services intercepts HTTP requests (including the ACME challenge path) + # and redirects them to Keycloak OAuth, which breaks Let's Encrypt. + # + # To work around this, we temporarily remove OSMO Ingress resources + # that have catch-all paths, create a clean bootstrap Ingress for the + # challenge, and restore everything with TLS once the cert is ready. # ------------------------------------------------------------------------- + REMOVED_INGRESSES=() + if [[ "$OSMO_DEPLOYED" == "true" ]]; then - # Mode B: Patch existing Ingress resources with TLS - log_info "Patching Ingress resources for TLS..." + log_info "Temporarily removing OSMO Ingress resources for certificate issuance..." 
+ log_info "(Envoy sidecars intercept ACME challenges; we need a clean path)" + # Save and remove all OSMO ingresses to prevent Envoy from intercepting + mkdir -p /tmp/osmo-tls-backup for ing in $(kubectl get ingress -n "${OSMO_NS}" -o name 2>/dev/null); do ing_name="${ing#*/}" - CURRENT_HTTP=$(kubectl get "$ing" -n "${OSMO_NS}" -o jsonpath='{.spec.rules[0].http}') - - kubectl patch "$ing" -n "${OSMO_NS}" --type=merge -p "$(cat < "/tmp/osmo-tls-backup/${ing_name}.yaml" + kubectl delete "$ing" -n "${OSMO_NS}" 2>/dev/null || true + REMOVED_INGRESSES+=("$ing_name") + log_info " Removed ingress/${ing_name} (backed up)" done - else - # Mode A: Create a temporary Ingress to trigger HTTP-01 challenge - log_info "Creating temporary Ingress for certificate issuance..." - kubectl apply -f - </dev/null || echo "") - if [[ "$CERT_READY" == "True" ]]; then - log_success "TLS certificate issued and ready" - break - fi - sleep 5 - done - - if [[ "$CERT_READY" != "True" ]]; then - log_warning "Certificate not ready yet. Checking status..." - kubectl describe certificate "${TLS_SECRET}" -n "${OSMO_NS}" 2>/dev/null | tail -10 - echo "" - log_info "It may take a few more minutes. Check with:" - echo " kubectl get certificate -n ${OSMO_NS}" - echo " kubectl describe challenge -n ${OSMO_NS}" + CERT_READY="False" + if wait_for_certificate "${TLS_SECRET}" 300; then + CERT_READY="True" fi # Copy main cert secret to ingress namespace if needed copy_secret_across_namespaces "${TLS_SECRET}" + # Always clean up the bootstrap Ingress + log_info "Removing bootstrap ingress..." 
+ kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + # ------------------------------------------------------------------------- # Issue TLS certificate for Keycloak auth subdomain # ------------------------------------------------------------------------- + AUTH_CERT_READY="False" if [[ -n "$AUTH_HOSTNAME" ]]; then log_info "Issuing TLS certificate for Keycloak auth subdomain: ${AUTH_HOSTNAME}..." + # Clean up previous auth cert attempts + kubectl delete certificate "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl delete secret "${KC_TLS_SECRET}" -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true + kubectl apply -f - </dev/null || echo "") - if [[ "$AUTH_CERT_READY" == "True" ]]; then - log_success "Auth TLS certificate issued and ready" - break - fi - sleep 5 - done - - if [[ "$AUTH_CERT_READY" != "True" ]]; then - log_warning "Auth certificate not ready yet. It may take a few more minutes." - log_info "Check with: kubectl get certificate ${KC_TLS_SECRET} -n ${OSMO_NS}" + if wait_for_certificate "${KC_TLS_SECRET}" 300; then + AUTH_CERT_READY="True" fi # Copy auth cert secret to ingress namespace if needed copy_secret_across_namespaces "${KC_TLS_SECRET}" - # Clean up bootstrap Ingress (prevents NGINX admission webhook conflicts) - log_info "Removing auth bootstrap ingress (certificate provisioned)..." - kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null + # Always clean up auth bootstrap Ingress + log_info "Removing auth bootstrap ingress..." + kubectl delete ingress osmo-tls-auth-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null || true fi - # Clean up main bootstrap Ingress - log_info "Removing main bootstrap ingress (certificate provisioned)..." 
- kubectl delete ingress osmo-tls-bootstrap -n "${OSMO_NS}" --ignore-not-found 2>/dev/null + # ------------------------------------------------------------------------- + # Restore OSMO Ingress resources with TLS (Mode B) + # ------------------------------------------------------------------------- + if [[ "$OSMO_DEPLOYED" == "true" && "$CERT_READY" == "True" ]]; then + log_info "Restoring OSMO Ingress resources with TLS..." + + for ing_name in "${REMOVED_INGRESSES[@]}"; do + backup_file="/tmp/osmo-tls-backup/${ing_name}.yaml" + [[ ! -f "$backup_file" ]] && continue + + # Determine which hostname/secret this ingress should use + local_host=$(yq -r '.spec.rules[0].host // ""' "$backup_file" 2>/dev/null || \ + python3 -c "import yaml,sys; d=yaml.safe_load(open('$backup_file')); print(d.get('spec',{}).get('rules',[{}])[0].get('host',''))" 2>/dev/null || echo "") + tls_secret_name="${TLS_SECRET}" + tls_host="${MAIN_HOSTNAME}" + if [[ "$local_host" == *"auth."* && -n "$AUTH_HOSTNAME" && "$AUTH_CERT_READY" == "True" ]]; then + tls_secret_name="${KC_TLS_SECRET}" + tls_host="${AUTH_HOSTNAME}" + fi + + # Re-apply the backup, then patch in TLS (no cert-manager annotation) + kubectl apply -f "$backup_file" 2>/dev/null || true + kubectl patch ingress "$ing_name" -n "${OSMO_NS}" --type=merge -p "$(cat </dev/null || true + log_info " ${ing_name}: restored (no TLS)" + done + rm -rf /tmp/osmo-tls-backup + log_info "Fix the certificate issue and re-run this script." 
+ fi + + # ------------------------------------------------------------------------- + # Final cleanup: remove any lingering solver pods + # ------------------------------------------------------------------------- + kubectl delete pods -n "${OSMO_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true + kubectl delete pods -n "${INGRESS_NS}" -l acme.cert-manager.io/http01-solver=true --ignore-not-found 2>/dev/null || true fi # end TLS_MODE diff --git a/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh index 40a3cff5a..5f1d5767c 100755 --- a/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh @@ -6,7 +6,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh index 754540fc1..f62352ca3 100755 --- a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh +++ b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh @@ -6,7 +6,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh b/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh index f4781e718..4f7b90542 100755 --- a/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh +++ b/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh @@ -6,7 +6,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 
&& pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh index c0f6775ab..56361b742 100755 --- a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh +++ b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh @@ -4,7 +4,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh b/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh index 27698e57b..e47a6773e 100755 --- a/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh +++ b/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh @@ -6,7 +6,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" CONFIG_DIR="${SCRIPT_DIR}/config" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh b/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh index 5db244d22..d8f41e1f5 100755 --- a/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh +++ b/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh @@ -55,11 +55,7 @@ else printf "Enter choice [1-${#VALID_REGIONS[@]}]: " read -r choice if [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#VALID_REGIONS[@]} )); then - REGION="${VALID_REGIONS[$choice]}" - # bash arrays are 0-based, zsh arrays are 1-based; adjust if needed - if [[ -z "$REGION" 
]]; then - REGION="${VALID_REGIONS[$((choice - 1))]}" - fi + REGION="${VALID_REGIONS[$((choice - 1))]}" break fi echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}." diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh index de869a0cf..656c9f6d2 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-gpu-infrastructure.sh @@ -5,7 +5,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh index caeaa8b74..9a9b14170 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-keycloak.sh @@ -3,7 +3,7 @@ # This removes Keycloak and related secrets. After running this, re-deploy # OSMO control plane without DEPLOY_KEYCLOAK to switch back to open API mode. set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." 
&& pwd)" source "${SCRIPT_DIR}/lib/common.sh" source "${SCRIPT_DIR}/defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh index 471029d5c..9b22947bf 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-nginx-ingress.sh @@ -1,7 +1,7 @@ #!/bin/bash # Uninstall NGINX Ingress Controller (deployed by 03-deploy-nginx-ingress.sh) set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/.." && pwd)" source "${SCRIPT_DIR}/lib/common.sh" INGRESS_NAMESPACE="${INGRESS_NAMESPACE:-ingress-nginx}" INGRESS_RELEASE_NAME="${INGRESS_RELEASE_NAME:-ingress-nginx}" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh index e847de5a6..bbafe9007 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-observability.sh @@ -5,7 +5,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh index cce604c99..dba6bc817 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-backend.sh @@ -6,7 +6,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source 
"${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" diff --git a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh index 0abb5f560..e19f183bf 100755 --- a/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh +++ b/applications/osmo/deploy/example/002-setup/cleanup/uninstall-osmo-control-plane.sh @@ -5,7 +5,7 @@ set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" source "${SCRIPT_DIR}/../lib/common.sh" source "${SCRIPT_DIR}/../defaults.sh" From 8af8156293abeac11737908b1e6e31bd54cb13cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 26 Feb 2026 14:21:29 +0100 Subject: [PATCH 28/37] - several bug fixes --- .../example/002-setup/06-configure-storage.sh | 10 ++++- .../002-setup/07-configure-service-url.sh | 7 ++++ .../002-setup/08-configure-gpu-platform.sh | 13 ++++++- .../002-setup/10-configure-dataset-bucket.sh | 39 +++---------------- .../example/002-setup/gpu_pod_template.json | 18 ++++++--- 5 files changed, 46 insertions(+), 41 deletions(-) diff --git a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh index f62352ca3..baf822b14 100755 --- a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh +++ b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh @@ -27,9 +27,15 @@ log_info "Retrieving storage configuration from Terraform..." S3_BUCKET=$(get_tf_output "storage_bucket.name" "../001-iac" 2>/dev/null || echo "") S3_ENDPOINT=$(get_tf_output "storage_bucket.endpoint" "../001-iac" 2>/dev/null || echo "") +# Require NEBIUS_REGION (set by nebius-env-init.sh) +if [[ -z "${NEBIUS_REGION:-}" ]]; then + log_error "NEBIUS_REGION is not set. 
Run 'source ../000-prerequisites/nebius-env-init.sh' first." + exit 1 +fi + # Default endpoint if not set if [[ -z "$S3_ENDPOINT" ]]; then - S3_ENDPOINT="https://storage.eu-north1.nebius.cloud" + S3_ENDPOINT="https://storage.${NEBIUS_REGION}.nebius.cloud" fi if [[ -z "$S3_BUCKET" ]]; then @@ -142,7 +148,7 @@ fi # Format: tos:/// S3_HOST=$(echo "$S3_ENDPOINT" | sed 's|https://||') BACKEND_URI="tos://${S3_HOST}/${S3_BUCKET}" -REGION="eu-north1" +REGION="${NEBIUS_REGION}" log_success "Storage credentials retrieved" diff --git a/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh b/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh index 4f7b90542..d0a291731 100755 --- a/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh +++ b/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh @@ -19,6 +19,13 @@ echo "" # Check prerequisites check_kubectl || exit 1 +if [[ -z "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + log_error "OSMO_INGRESS_HOSTNAME is not set." + echo " Source your environment first: source ../000-prerequisites/nebius-env-init.sh" + echo " Or set it manually: export OSMO_INGRESS_HOSTNAME=" + exit 1 +fi + # ----------------------------------------------------------------------------- # Start port-forward # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh index 56361b742..7eb6f4c3f 100755 --- a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh +++ b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh @@ -11,6 +11,12 @@ source "${SCRIPT_DIR}/defaults.sh" OSMO_URL="${OSMO_URL:-http://localhost:8080}" OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +# Require NEBIUS_REGION (set by nebius-env-init.sh) +if [[ -z "${NEBIUS_REGION:-}" ]]; then + echo "ERROR: NEBIUS_REGION is not set. 
Run 'source ../000-prerequisites/nebius-env-init.sh' first." + exit 1 +fi + echo "" echo "========================================" echo " OSMO GPU Platform Configuration" @@ -54,9 +60,14 @@ log_success "Port-forward ready" # ----------------------------------------------------------------------------- log_info "Creating gpu_tolerations pod template..." +# Substitute {{NEBIUS_REGION}} placeholder in the template +GPU_POD_TEMPLATE_RESOLVED="/tmp/gpu_pod_template_resolved.json" +sed "s/{{NEBIUS_REGION}}/${NEBIUS_REGION}/g" "${SCRIPT_DIR}/gpu_pod_template.json" > "${GPU_POD_TEMPLATE_RESOLVED}" + RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/gpu_tolerations" \ -w "\n%{http_code}" \ - -d @"${SCRIPT_DIR}/gpu_pod_template.json") + -d @"${GPU_POD_TEMPLATE_RESOLVED}") +rm -f "${GPU_POD_TEMPLATE_RESOLVED}" HTTP_CODE=$(echo "$RESPONSE" | tail -n1) BODY=$(echo "$RESPONSE" | sed '$d') diff --git a/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh b/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh index d8f41e1f5..bf676e073 100755 --- a/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh +++ b/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh @@ -27,41 +27,14 @@ DATASET_BUCKET_NAME="${DATASET_BUCKET_NAME:-nebius}" check_kubectl || exit 1 # ----------------------------------------------------------------------------- -# Select Nebius Region +# Nebius Region (from nebius-env-init.sh) # ----------------------------------------------------------------------------- -VALID_REGIONS=("eu-north1" "me-west1") - -if [[ -n "${NEBIUS_REGION:-}" ]]; then - REGION="$NEBIUS_REGION" - matched=false - for r in "${VALID_REGIONS[@]}"; do - [[ "$r" == "$REGION" ]] && matched=true && break - done - if ! $matched; then - log_error "Invalid NEBIUS_REGION '${REGION}'. 
Valid options: ${VALID_REGIONS[*]}" - exit 1 - fi - log_info "Using region from NEBIUS_REGION: ${REGION}" -else - echo "Select the Nebius region for the storage bucket:" - echo "" - _idx=1 - for _r in "${VALID_REGIONS[@]}"; do - echo " ${_idx}) ${_r}" - _idx=$((_idx + 1)) - done - echo "" - while true; do - printf "Enter choice [1-${#VALID_REGIONS[@]}]: " - read -r choice - if [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#VALID_REGIONS[@]} )); then - REGION="${VALID_REGIONS[$((choice - 1))]}" - break - fi - echo "Invalid selection. Please enter a number between 1 and ${#VALID_REGIONS[@]}." - done - log_info "Selected region: ${REGION}" +if [[ -z "${NEBIUS_REGION:-}" ]]; then + log_error "NEBIUS_REGION is not set. Run 'source ../000-prerequisites/nebius-env-init.sh' first." + exit 1 fi +REGION="$NEBIUS_REGION" +log_info "Using region: ${REGION}" S3_REGION_FOR_BOTO="${REGION}" diff --git a/applications/osmo/deploy/example/002-setup/gpu_pod_template.json b/applications/osmo/deploy/example/002-setup/gpu_pod_template.json index 107207028..d704f64fc 100755 --- a/applications/osmo/deploy/example/002-setup/gpu_pod_template.json +++ b/applications/osmo/deploy/example/002-setup/gpu_pod_template.json @@ -14,7 +14,7 @@ "env": [ { "name": "AWS_ENDPOINT_URL_S3", - "value": "https://storage.me-west1.nebius.cloud:443" + "value": "https://storage.{{NEBIUS_REGION}}.nebius.cloud:443" }, { "name": "AWS_S3_FORCE_PATH_STYLE", @@ -22,7 +22,7 @@ }, { "name": "AWS_DEFAULT_REGION", - "value": "us-east-1" + "value": "{{NEBIUS_REGION}}" }, { "name": "OSMO_LOGIN_DEV", @@ -32,14 +32,22 @@ "name": "OSMO_SKIP_DATA_AUTH", "value": "1" } - ] + ], + "resources": { + "limits": { + "nvidia.com/gpu": "{{USER_GPU}}" + }, + "requests": { + "nvidia.com/gpu": "{{USER_GPU}}" + } + } }, { "name": "osmo-ctrl", "env": [ { "name": "AWS_ENDPOINT_URL_S3", - "value": "https://storage.me-west1.nebius.cloud:443" + "value": "https://storage.{{NEBIUS_REGION}}.nebius.cloud:443" }, { "name": 
"AWS_S3_FORCE_PATH_STYLE", @@ -47,7 +55,7 @@ }, { "name": "AWS_DEFAULT_REGION", - "value": "us-east-1" + "value": "{{NEBIUS_REGION}}" }, { "name": "OSMO_LOGIN_DEV", From 34510bd4e702070768fe7bdd0f4a547e348927cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 26 Feb 2026 20:11:19 +0100 Subject: [PATCH 29/37] - add shared filesystem patches --- .../000-prerequisites/nebius-env-init.sh | 3 + .../001-iac/terraform.tfvars.reserved.example | 100 ++++++++++++++++++ .../deploy/example/002-setup/04-enable-tls.sh | 6 +- .../002-setup/gpu_platform_update.json | 2 +- .../osmo/workflows/osmo/test_shared_fs.yaml | 54 ++++++++++ 5 files changed, 161 insertions(+), 4 deletions(-) create mode 100644 applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example create mode 100644 applications/osmo/workflows/osmo/test_shared_fs.yaml diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh index cd52bc201..8e0b9e8fc 100755 --- a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -16,6 +16,9 @@ NEBIUS_TENANT_ID="${NEBIUS_TENANT_ID:-}" # e.g. tenant-abc123def456 NEBIUS_PROJECT_ID="${NEBIUS_PROJECT_ID:-}" # e.g. 
project-abc123def456 NEBIUS_REGION="${NEBIUS_REGION:-eu-north1}" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 + +export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-.eu-north1.osmo.nebius.cloud}" +export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-auth.${OSMO_INGRESS_HOSTNAME}}" # ======================================== # Colors diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example new file mode 100644 index 000000000..1da5a72ee --- /dev/null +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.reserved.example @@ -0,0 +1,100 @@ +# ============================================================================= +# OSMO on Nebius - Reserved GPU Configuration +# ============================================================================= +# This configuration uses reserved GPU capacity (Capacity Block Groups). +# Reserved instances guarantee availability and are billed at a fixed rate. 
+# +# Prerequisites: +# - A Capacity Block Group must be created in the Nebius Console +# - Set the capacity block group ID below +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Required Settings (set via environment or uncomment) +# ----------------------------------------------------------------------------- +# Run: source ../000-prerequisites/nebius-env-init.sh +# This will set TF_VAR_tenant_id and TF_VAR_parent_id automatically +# +# tenant_id = "your-tenant-id" +# parent_id = "your-project-id" + +# ----------------------------------------------------------------------------- +# Environment Settings +# ----------------------------------------------------------------------------- +region = "eu-north1" +environment = "pro" +project_name = "osmo" + +# ----------------------------------------------------------------------------- +# Kubernetes Cluster +# ----------------------------------------------------------------------------- +k8s_version = null +etcd_cluster_size = 3 +enable_public_endpoint = false # Private API - access via WireGuard + +# ----------------------------------------------------------------------------- +# CPU Nodes +# ----------------------------------------------------------------------------- +cpu_nodes_count = 3 +cpu_nodes_platform = "cpu-d3" +cpu_nodes_preset = "16vcpu-64gb" +cpu_disk_size_gib = 128 +cpu_nodes_assign_public_ip = false + +# ----------------------------------------------------------------------------- +# GPU Nodes (RESERVED) +# ----------------------------------------------------------------------------- +gpu_nodes_count_per_group = 2 # Must match your reservation size +gpu_node_groups = 1 +gpu_nodes_platform = "gpu-h100-sxm" +gpu_nodes_preset = "8gpu-128vcpu-1600gb" +gpu_disk_size_gib = 1023 +gpu_nodes_assign_public_ip = false +enable_gpu_cluster = true # InfiniBand for multi-node training +enable_gpu_taints = 
true +gpu_nodes_preemptible = false # Reserved nodes are not preemptible +gpu_nodes_driverfull_image = false +infiniband_fabric = null # Use region default + +# RESERVATION: Set your Capacity Block Group ID here +gpu_reservation_ids = ["capacityblockgroup-e00xxxxx"] + +# GPU reservation options by region: +# eu-north1: gpu-h100-sxm (8gpu-128vcpu-1600gb), gpu-h200-sxm (8gpu-128vcpu-1600gb) +# eu-north2: gpu-h200-sxm (8gpu-128vcpu-1600gb) +# eu-west1: gpu-h200-sxm (8gpu-128vcpu-1600gb) +# me-west1: gpu-b200-sxm-a (8gpu-160vcpu-1792gb) +# uk-south1: gpu-b300-sxm (8gpu-192vcpu-2768gb) +# us-central1: gpu-h200-sxm (8gpu-128vcpu-1600gb), gpu-b200-sxm (8gpu-160vcpu-1792gb) + +# ----------------------------------------------------------------------------- +# Storage +# ----------------------------------------------------------------------------- +enable_filestore = true +filestore_size_gib = 1024 + +# ----------------------------------------------------------------------------- +# PostgreSQL (Nebius Managed Service) +# ----------------------------------------------------------------------------- +postgresql_preset = "4vcpu-16gb" +postgresql_disk_size_gib = 50 +postgresql_host_count = 1 + +# ----------------------------------------------------------------------------- +# WireGuard VPN (recommended for private clusters) +# ----------------------------------------------------------------------------- +enable_wireguard = true +wireguard_platform = "cpu-d3" +wireguard_preset = "2vcpu-8gb" +wireguard_disk_size_gib = 32 +wireguard_port = 51820 +wireguard_network = "10.8.0.0/24" +wireguard_ui_port = 5000 + +# ============================================================================= +# After deployment: +# 1. Set up WireGuard client: cd ../000-prerequisites && ./wireguard-client-setup.sh +# 2. Connect to VPN +# 3. Get kubectl credentials: nebius mk8s cluster get-credentials --id +# 4. 
Access cluster via private endpoint +# ============================================================================= diff --git a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh index f1a569a27..bb9ae2a96 100755 --- a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh @@ -441,10 +441,10 @@ EOF log_info "Waiting for certificate '${cert_name}' (up to ${max_wait}s)..." while [[ $elapsed -lt $max_wait ]]; do - local status - status=$(kubectl get certificate "${cert_name}" -n "${OSMO_NS}" \ + local cert_status + cert_status=$(kubectl get certificate "${cert_name}" -n "${OSMO_NS}" \ -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "") - if [[ "$status" == "True" ]]; then + if [[ "$cert_status" == "True" ]]; then log_success "Certificate '${cert_name}' issued and ready" return 0 fi diff --git a/applications/osmo/deploy/example/002-setup/gpu_platform_update.json b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json index 56c0764fe..da10c6a05 100755 --- a/applications/osmo/deploy/example/002-setup/gpu_platform_update.json +++ b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json @@ -3,7 +3,7 @@ "description": "GPU platform for L40S nodes", "host_network_allowed": false, "privileged_allowed": false, - "allowed_mounts": [], + "allowed_mounts": ["/mnt/data"], "default_mounts": [], "default_variables": { "USER_GPU": 1 diff --git a/applications/osmo/workflows/osmo/test_shared_fs.yaml b/applications/osmo/workflows/osmo/test_shared_fs.yaml new file mode 100644 index 000000000..1de8fea84 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_shared_fs.yaml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# Shared Filesystem Test Workflow +# Validates that the Nebius Filestore mounted at /mnt/data is accessible from OSMO workflows. +# Requires: enable_filestore=true in terraform.tfvars and allowed_mounts configured on the gpu platform. +# +# Submit with: +# osmo workflow submit workflows/osmo/test_shared_fs.yaml + +workflow: + name: test-shared-fs + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: test-filestore + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Shared Filesystem Test ===" + echo "" + echo "=== Mount info ===" + df -h /mnt/data + echo "" + echo "=== Writing test file ===" + echo "Hello from OSMO workflow at $(date -Iseconds)" > /mnt/data/osmo-test.txt + cat /mnt/data/osmo-test.txt + echo "" + echo "=== Directory listing ===" + ls -la /mnt/data/ + echo "" + echo "=== Shared Filesystem Test Complete ===" + resource: gpu-resource + volumeMounts: + - /mnt/data From e8662203eed27f2a9e5a0a0ceeb0c5bd8205a5d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 26 Feb 2026 21:15:19 +0100 Subject: [PATCH 30/37] - fix default user - add keycload password check --- .../000-prerequisites/nebius-env-init.sh | 10 ++-- .../002-setup/08-configure-gpu-platform.sh | 36 ++++++++++++-- .../002-setup/12-show-keycloak-credentials.sh | 47 +++++++++++++++++++ 
.../002-setup/default_user_pod_template.json | 24 ++++++++++ 4 files changed, 110 insertions(+), 7 deletions(-) create mode 100755 applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh create mode 100644 applications/osmo/deploy/example/002-setup/default_user_pod_template.json diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh index 8e0b9e8fc..850dd7e18 100755 --- a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -17,8 +17,8 @@ NEBIUS_TENANT_ID="${NEBIUS_TENANT_ID:-}" # e.g. tenant-abc123def456 NEBIUS_PROJECT_ID="${NEBIUS_PROJECT_ID:-}" # e.g. project-abc123def456 NEBIUS_REGION="${NEBIUS_REGION:-eu-north1}" # eu-north1, eu-north2, eu-west1, me-west1, uk-south1, us-central1 -export OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-.eu-north1.osmo.nebius.cloud}" -export KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-auth.${OSMO_INGRESS_HOSTNAME}}" +OSMO_INGRESS_HOSTNAME="${OSMO_INGRESS_HOSTNAME:-}" # e.g. myapp.eu-north1.osmo.nebius.cloud +KEYCLOAK_HOSTNAME="${KEYCLOAK_HOSTNAME:-}" # e.g. auth.myapp.eu-north1.osmo.nebius.cloud # ======================================== # Colors @@ -166,6 +166,8 @@ main() { export NEBIUS_TENANT_ID export NEBIUS_PROJECT_ID export NEBIUS_REGION + export OSMO_INGRESS_HOSTNAME + export KEYCLOAK_HOSTNAME # Get IAM token for Terraform provider authentication echo "Getting IAM token for Terraform..." @@ -248,7 +250,9 @@ main() { echo " NEBIUS_REGION = $NEBIUS_REGION" echo " NEBIUS_IAM_TOKEN = ${NEBIUS_IAM_TOKEN:0:20}... 
(truncated)" echo " NEBIUS_NETWORK_ID = $NEBIUS_NETWORK_ID" - echo " NEBIUS_SUBNET_ID = $NEBIUS_SUBNET_ID" + echo " NEBIUS_SUBNET_ID = $NEBIUS_SUBNET_ID" + echo " OSMO_INGRESS_HOSTNAME = $OSMO_INGRESS_HOSTNAME" + echo " KEYCLOAK_HOSTNAME = $KEYCLOAK_HOSTNAME" echo "" echo " Network: $network_name ($network_id)" echo " Subnet: $subnet_name ($subnet_id)" diff --git a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh index 7eb6f4c3f..b0247673a 100755 --- a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh +++ b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh @@ -56,7 +56,29 @@ done log_success "Port-forward ready" # ----------------------------------------------------------------------------- -# Step 1: Create GPU pod template +# Step 1: Fix default_user pod template (remove GPU resources) +# ----------------------------------------------------------------------------- +# The built-in default_user template includes nvidia.com/gpu which causes ALL +# workflows (including CPU-only) to request the nvidia RuntimeClass. This fails +# on CPU nodes. We move GPU resources to the gpu_tolerations template instead. +log_info "Updating default_user pod template (removing GPU resources)..." 
+ +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/default_user" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/default_user_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "default_user pod template updated (HTTP ${HTTP_CODE})" +else + log_error "Failed to update default_user pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + +# ----------------------------------------------------------------------------- +# Step 2: Create GPU pod template # ----------------------------------------------------------------------------- log_info "Creating gpu_tolerations pod template..." @@ -80,7 +102,7 @@ else fi # ----------------------------------------------------------------------------- -# Step 2: Create GPU platform +# Step 3: Create GPU platform # ----------------------------------------------------------------------------- log_info "Creating gpu platform in default pool..." @@ -99,7 +121,7 @@ else fi # ----------------------------------------------------------------------------- -# Step 3: Verify configuration +# Step 4: Verify configuration # ----------------------------------------------------------------------------- log_info "Verifying configuration..." @@ -112,7 +134,7 @@ echo "GPU platform config:" osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' # ----------------------------------------------------------------------------- -# Step 4: Check GPU resources +# Step 5: Check GPU resources # ----------------------------------------------------------------------------- log_info "Checking GPU resources..." 
sleep 3 # Wait for backend to pick up changes @@ -127,6 +149,12 @@ if [[ "$RESOURCE_COUNT" -gt 0 ]]; then echo "$RESOURCE_JSON" | jq '.resources[] | select(.allocatable_fields.gpu != null) | {name: .name, gpu: .allocatable_fields.gpu, cpu: .allocatable_fields.cpu, memory: .allocatable_fields.memory}' fi +# ----------------------------------------------------------------------------- +# Step 6: Set default pool profile +# ----------------------------------------------------------------------------- +log_info "Setting default pool to 'default'..." +osmo profile set pool default 2>/dev/null && log_success "Default pool set" || log_warning "Could not set default pool (set manually: osmo profile set pool default)" + # ----------------------------------------------------------------------------- # Done # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh b/applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh new file mode 100755 index 000000000..27716526d --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Show Keycloak admin credentials +# Retrieves the admin password from the keycloak-admin-secret Kubernetes secret. + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" + +echo "" +echo "========================================" +echo " Keycloak Admin Credentials" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 + +# Retrieve admin password from Kubernetes secret +log_info "Retrieving Keycloak admin password..." 
+ +ADMIN_PASSWORD=$(kubectl get secret keycloak-admin-secret -n "${OSMO_NAMESPACE}" \ + -o jsonpath='{.data.password}' 2>/dev/null | base64 -d 2>/dev/null) || true + +if [[ -z "${ADMIN_PASSWORD}" ]]; then + log_error "Could not retrieve Keycloak admin password from secret 'keycloak-admin-secret' in namespace '${OSMO_NAMESPACE}'." + echo " Make sure Keycloak has been deployed (04-deploy-osmo-control-plane.sh)." + exit 1 +fi + +# Determine Keycloak URL +if [[ -n "${KEYCLOAK_HOSTNAME:-}" ]]; then + KEYCLOAK_URL="https://${KEYCLOAK_HOSTNAME}" +elif [[ -n "${OSMO_INGRESS_HOSTNAME:-}" ]]; then + KEYCLOAK_URL="https://auth-${OSMO_INGRESS_HOSTNAME}" +else + KEYCLOAK_URL="(unknown — set KEYCLOAK_HOSTNAME or OSMO_INGRESS_HOSTNAME)" +fi + +echo "" +echo " URL: ${KEYCLOAK_URL}" +echo " Username: admin" +echo " Password: ${ADMIN_PASSWORD}" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/default_user_pod_template.json b/applications/osmo/deploy/example/002-setup/default_user_pod_template.json new file mode 100644 index 000000000..71eed214b --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/default_user_pod_template.json @@ -0,0 +1,24 @@ +{ + "configs": { + "spec": { + "containers": [ + { + "name": "{{USER_CONTAINER_NAME}}", + "resources": { + "limits": { + "cpu": "{{USER_CPU}}", + "memory": "{{USER_MEMORY}}", + "ephemeral-storage": "{{USER_STORAGE}}" + }, + "requests": { + "cpu": "{{USER_CPU}}", + "memory": "{{USER_MEMORY}}", + "ephemeral-storage": "{{USER_STORAGE}}" + } + } + } + ] + } + }, + "description": "Default user container template (GPU resources moved to gpu_tolerations template)" +} From b4bbfec0246d2b7b07aa68a3e3d1f0f84b36ebf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 5 Mar 2026 16:52:02 +0100 Subject: [PATCH 31/37] - many bug fixes --- .../modules/k8s/templates/cloud-init.yaml | 8 +- .../osmo/deploy/example/001-iac/outputs.tf | 5 + .../example/001-iac/terraform.tfvars.example | 2 +- 
.../osmo/deploy/example/001-iac/variables.tf | 2 +- .../002-setup/01-deploy-gpu-infrastructure.sh | 7 + .../002-setup/04-deploy-osmo-control-plane.sh | 15 +- .../example/002-setup/06-configure-storage.sh | 22 + .../002-setup/08-configure-gpu-platform.sh | 57 ++- .../002-setup/10-verify-installation.sh | 426 ++++++++++++++++++ .../11-connect-remote-control-plane.sh | 190 ++++++++ .../002-setup/gpu_platform_update.json | 4 +- 11 files changed, 719 insertions(+), 19 deletions(-) create mode 100755 applications/osmo/deploy/example/002-setup/10-verify-installation.sh create mode 100755 applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh diff --git a/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml b/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml index 096cf18b0..66ba3fa2a 100755 --- a/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml +++ b/applications/osmo/deploy/example/001-iac/modules/k8s/templates/cloud-init.yaml @@ -16,11 +16,13 @@ packages: %{ if enable_filestore ~} runcmd: - # Mount filestore if attached + # Mount filestore if attached (virtiofs is not a block device, check sysfs tags) - | - if [ -b /dev/disk/by-id/virtio-data ]; then + if grep -qs '^data$' /sys/fs/virtiofs/*/tag 2>/dev/null; then mkdir -p /mnt/data mount -t virtiofs data /mnt/data || true - echo "data /mnt/data virtiofs defaults 0 0" >> /etc/fstab + if ! 
grep -qs 'virtiofs.*mnt/data' /etc/fstab; then + echo "data /mnt/data virtiofs defaults 0 0" >> /etc/fstab + fi fi %{ endif ~} diff --git a/applications/osmo/deploy/example/001-iac/outputs.tf b/applications/osmo/deploy/example/001-iac/outputs.tf index 467feaad0..ae983a430 100755 --- a/applications/osmo/deploy/example/001-iac/outputs.tf +++ b/applications/osmo/deploy/example/001-iac/outputs.tf @@ -140,6 +140,11 @@ output "gpu_nodes_driverfull_image" { value = var.gpu_nodes_driverfull_image } +output "gpu_nodes_platform" { + description = "GPU platform type (e.g. gpu-h100-sxm, gpu-h200-sxm)" + value = var.gpu_nodes_platform +} + # ----------------------------------------------------------------------------- # Connection Instructions # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/terraform.tfvars.example b/applications/osmo/deploy/example/001-iac/terraform.tfvars.example index 89e7b63ea..aa12368d1 100755 --- a/applications/osmo/deploy/example/001-iac/terraform.tfvars.example +++ b/applications/osmo/deploy/example/001-iac/terraform.tfvars.example @@ -60,7 +60,7 @@ filestore_size_gib = 1024 # ----------------------------------------------------------------------------- # Platform depends on region: cpu-e2 (eu-north1), cpu-d3 (all other regions) # postgresql_platform = null # Auto: cpu-e2 (eu-north1), cpu-d3 (other regions) -postgresql_preset = "2vcpu-8gb" # Available presets vary by region +postgresql_preset = "4vcpu-16gb" # Available presets vary by region # postgresql_disk_type = null # Auto: network-ssd (eu-north1), nbs-csi-sc (other regions) postgresql_disk_size_gib = 50 postgresql_host_count = 1 diff --git a/applications/osmo/deploy/example/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf index 6a16845e2..3557341fb 100755 --- a/applications/osmo/deploy/example/001-iac/variables.tf +++ b/applications/osmo/deploy/example/001-iac/variables.tf @@ -311,7 
+311,7 @@ variable "postgresql_platform" { variable "postgresql_preset" { description = "PostgreSQL resource preset (2vcpu-8gb is minimum)" type = string - default = "2vcpu-8gb" + default = "4vcpu-16gb" } variable "postgresql_disk_type" { diff --git a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh index 7e5d19679..e6a8b0578 100755 --- a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh @@ -56,9 +56,16 @@ else kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + DRIVER_VERSION_ARGS=() + if [[ -n "${GPU_DRIVER_VERSION:-}" ]]; then + log_info "Using pinned driver version: ${GPU_DRIVER_VERSION}" + DRIVER_VERSION_ARGS=(--set "driver.version=${GPU_DRIVER_VERSION}") + fi + helm upgrade --install gpu-operator nvidia/gpu-operator \ --namespace "${GPU_OPERATOR_NAMESPACE}" \ --values "${VALUES_DIR}/gpu-operator.yaml" \ + "${DRIVER_VERSION_ARGS[@]+"${DRIVER_VERSION_ARGS[@]}"}" \ --timeout 10m log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" diff --git a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh index 2820377a7..1052b9db9 100755 --- a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh @@ -490,12 +490,17 @@ if kubectl get statefulset redis-master -n "${OSMO_NAMESPACE}" &>/dev/null; then else helm upgrade --install redis bitnami/redis \ --namespace "${OSMO_NAMESPACE}" \ + --version 25.3.1 \ --set architecture=standalone \ --set auth.enabled=false \ - --set master.persistence.size=1Gi \ - --set master.resources.requests.cpu=100m \ - --set 
master.resources.requests.memory=128Mi \ - --wait --timeout 5m + --set networkPolicy.enabled=false \ + --set master.persistence.size=50Gi \ + --set master.resources.requests.cpu=8 \ + --set master.resources.requests.memory=52820Mi \ + --set master.resources.limits.cpu=8 \ + --set master.resources.limits.memory=52820Mi \ + --set commonConfiguration="aof-load-corrupt-tail-max-size 10000000" \ + --wait --timeout 10m log_success "Redis deployed" fi @@ -674,7 +679,7 @@ fi) resources: requests: cpu: "500m" - memory: "512Mi" + memory: "768Mi" limits: cpu: "2" memory: "1Gi" diff --git a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh index baf822b14..c4ef993e5 100755 --- a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh +++ b/applications/osmo/deploy/example/002-setup/06-configure-storage.sh @@ -217,6 +217,28 @@ fi # Cleanup temp files rm -f /tmp/workflow_log_config.json /tmp/workflow_data_config.json +# ----------------------------------------------------------------------------- +# Configure Workflow Limits +# ----------------------------------------------------------------------------- +log_info "Configuring workflow limits (max_num_tasks=200)..." 
+ +WORKFLOW_LIMITS_CONFIG=$(cat < /tmp/workflow_limits_config.json + +if osmo_config_update WORKFLOW /tmp/workflow_limits_config.json "Configure workflow limits"; then + log_success "Workflow limits configured (max_num_tasks=200)" +else + log_warning "Failed to configure workflow limits (may require newer OSMO version)" +fi + +rm -f /tmp/workflow_limits_config.json + # ----------------------------------------------------------------------------- # Verify Configuration # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh index b0247673a..e98fca791 100755 --- a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh +++ b/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh @@ -17,9 +17,33 @@ if [[ -z "${NEBIUS_REGION:-}" ]]; then exit 1 fi +# ----------------------------------------------------------------------------- +# Determine GPU platform name +# ----------------------------------------------------------------------------- +# Try to read the GPU platform from Terraform output and derive a friendly name. +# Maps: gpu-h100-sxm -> H100, gpu-h200-sxm -> H200, gpu-b200-sxm-a -> B200, etc. +# Falls back to user input if Terraform is unavailable. +if [[ -z "${GPU_PLATFORM_NAME:-}" ]]; then + TF_GPU_PLATFORM=$(get_tf_output "gpu_nodes_platform" "../001-iac" 2>/dev/null || echo "") + if [[ -n "$TF_GPU_PLATFORM" ]]; then + # Extract friendly name: gpu-h100-sxm -> H100, gpu-b200-sxm-a -> B200, gpu-l40s-a -> L40S + GPU_PLATFORM_NAME=$(echo "$TF_GPU_PLATFORM" | sed -E 's/^gpu-([a-zA-Z0-9]+).*/\1/' | tr '[:lower:]' '[:upper:]') + log_info "Auto-detected GPU platform from Terraform: ${TF_GPU_PLATFORM} -> ${GPU_PLATFORM_NAME}" + else + echo "" + echo "Could not auto-detect GPU platform from Terraform." + read -r -p "Enter GPU platform name (e.g. 
H100, H200, B200, L40S): " GPU_PLATFORM_NAME + if [[ -z "$GPU_PLATFORM_NAME" ]]; then + log_error "GPU platform name is required." + exit 1 + fi + fi +fi + echo "" echo "========================================" echo " OSMO GPU Platform Configuration" +echo " Platform name: ${GPU_PLATFORM_NAME}" echo "========================================" echo "" @@ -101,21 +125,40 @@ else exit 1 fi +# ----------------------------------------------------------------------------- +# Step 2b: Create shared memory pod template +# ----------------------------------------------------------------------------- +log_info "Creating shm pod template (shared memory for vLLM, PyTorch, etc.)..." + +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pod_template/shm" \ + -w "\n%{http_code}" \ + -d @"${SCRIPT_DIR}/shm_pod_template.json") +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then + log_success "Shared memory pod template created (HTTP ${HTTP_CODE})" +else + log_error "Failed to create shm pod template (HTTP ${HTTP_CODE})" + echo "Response: ${BODY}" + exit 1 +fi + # ----------------------------------------------------------------------------- # Step 3: Create GPU platform # ----------------------------------------------------------------------------- -log_info "Creating gpu platform in default pool..." +log_info "Creating platform '${GPU_PLATFORM_NAME}' in default pool..." 
-RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/gpu" \ +RESPONSE=$(osmo_curl PUT "${OSMO_URL}/api/configs/pool/default/platform/${GPU_PLATFORM_NAME}" \ -w "\n%{http_code}" \ -d @"${SCRIPT_DIR}/gpu_platform_update.json") HTTP_CODE=$(echo "$RESPONSE" | tail -n1) BODY=$(echo "$RESPONSE" | sed '$d') if [[ "$HTTP_CODE" -ge 200 && "$HTTP_CODE" -lt 300 ]]; then - log_success "GPU platform created (HTTP ${HTTP_CODE})" + log_success "Platform '${GPU_PLATFORM_NAME}' created (HTTP ${HTTP_CODE})" else - log_error "Failed to create GPU platform (HTTP ${HTTP_CODE})" + log_error "Failed to create platform '${GPU_PLATFORM_NAME}' (HTTP ${HTTP_CODE})" echo "Response: ${BODY}" exit 1 fi @@ -130,8 +173,8 @@ echo "Pod templates:" osmo_curl GET "${OSMO_URL}/api/configs/pod_template" | jq 'keys' echo "" -echo "GPU platform config:" -osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq '.platforms.gpu' +echo "Platform '${GPU_PLATFORM_NAME}' config:" +osmo_curl GET "${OSMO_URL}/api/configs/pool/default" | jq ".platforms.${GPU_PLATFORM_NAME}" # ----------------------------------------------------------------------------- # Step 5: Check GPU resources @@ -161,7 +204,7 @@ osmo profile set pool default 2>/dev/null && log_success "Default pool set" || l log_success "GPU platform configuration complete" echo "" echo "To submit a GPU workflow:" -echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default" +echo " osmo workflow submit workflows/osmo/gpu_test.yaml -p default --platform ${GPU_PLATFORM_NAME}" echo "" echo "Or test via curl:" echo " curl -X POST ${OSMO_URL}/api/workflow -H 'Content-Type: application/yaml' --data-binary @workflows/osmo/gpu_test.yaml" diff --git a/applications/osmo/deploy/example/002-setup/10-verify-installation.sh b/applications/osmo/deploy/example/002-setup/10-verify-installation.sh new file mode 100755 index 000000000..acd5e55f1 --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/10-verify-installation.sh @@ -0,0 
+1,426 @@ +#!/bin/bash +# ============================================================================= +# OSMO Installation Verification Script +# ============================================================================= +# Checks that all required components are properly configured: +# 1. GPU Operator running with driver enabled, version 580.95.05 +# 2. /mnt/data mounted on all nodes +# 3. 64Gi /dev/shm pod template configured in OSMO +# 4. Redis sized correctly (8 vCPU, ~52.82Gi mem, 50Gi PVC) +# 5. max_num_tasks >= 200 in WORKFLOW config +# 6. Platform name is not the default "gpu" +# +# Prerequisites: +# - kubectl configured and connected to the target cluster +# (run: nebius mk8s cluster get-credentials --id --external) +# - helm CLI installed (for GPU Operator checks) +# - jq installed +# - curl installed +# - OSMO CLI installed and accessible (for osmo login) +# - Port 8080 available locally (used for port-forward to OSMO service) +# - NEBIUS_REGION set (run: source ../000-prerequisites/nebius-env-init.sh) +# +# Usage: +# ./10-verify-installation.sh +# +# Environment variables (optional overrides): +# OSMO_URL OSMO API URL (default: http://localhost:8080) +# OSMO_NAMESPACE Namespace where OSMO is deployed (default: osmo) +# GPU_OPERATOR_NAMESPACE Namespace for GPU Operator (default: gpu-operator) +# EXPECTED_DRIVER_VERSION Expected NVIDIA driver version (default: 580.95.05) +# ============================================================================= + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +OSMO_URL="${OSMO_URL:-http://localhost:8080}" +OSMO_NAMESPACE="${OSMO_NAMESPACE:-osmo}" +EXPECTED_DRIVER_VERSION="${EXPECTED_DRIVER_VERSION:-580.95.05}" +MIN_REDIS_CPU=8 +MIN_REDIS_MEMORY_GI=50 +MIN_REDIS_PVC_GI=50 +MIN_MAX_NUM_TASKS=200 +EXPECTED_SHM_SIZE="64Gi" + +PASS=0 +FAIL=0 +WARN=0 + +check_pass() { + ((PASS++)) + log_success "$1" +} + +check_fail() { + 
((FAIL++)) + log_error "$1" +} + +check_warn() { + ((WARN++)) + log_warning "$1" +} + +echo "" +echo "========================================" +echo " OSMO Installation Verification" +echo "========================================" +echo "" + +# ----------------------------------------------------------------------------- +# Prerequisite checks +# ----------------------------------------------------------------------------- +log_info "Checking prerequisites..." + +PREREQ_OK=true + +if ! command -v kubectl &>/dev/null; then + log_error "kubectl not found. Install it and configure cluster access first." + PREREQ_OK=false +fi + +if ! command -v helm &>/dev/null; then + log_error "helm not found. Install helm to check GPU Operator configuration." + PREREQ_OK=false +fi + +if ! command -v jq &>/dev/null; then + log_error "jq not found. Install jq for JSON parsing." + PREREQ_OK=false +fi + +if ! command -v curl &>/dev/null; then + log_error "curl not found." + PREREQ_OK=false +fi + +if [[ "$PREREQ_OK" != "true" ]]; then + log_error "Missing prerequisites. Fix the above and re-run." + return 2>/dev/null || true +fi + +# Verify kubectl can reach the cluster +if ! kubectl cluster-info &>/dev/null; then + log_error "kubectl cannot reach the cluster. 
Connect first:" + echo " nebius mk8s cluster get-credentials --id --external" + return 2>/dev/null || true +fi + +CLUSTER_CONTEXT=$(kubectl config current-context 2>/dev/null || echo "unknown") +log_info "Connected to cluster: ${CLUSTER_CONTEXT}" +echo "" + +# ============================================================================= +# Check 1: GPU Operator with driver +# ============================================================================= +log_info "--- Check 1: GPU Operator & Driver ---" + +# Check GPU Operator is deployed +if helm list -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" 2>/dev/null | grep -q gpu-operator; then + check_pass "GPU Operator helm release found" +else + check_fail "GPU Operator helm release NOT found in namespace ${GPU_OPERATOR_NAMESPACE:-gpu-operator}" +fi + +# Check driver is enabled (not disabled via --set driver.enabled=false) +DRIVER_ENABLED=$(helm get values gpu-operator -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -a -o json 2>/dev/null | jq -r '.driver.enabled // empty' || echo "unknown") +if [[ "$DRIVER_ENABLED" == "true" ]]; then + check_pass "GPU driver is enabled in GPU Operator" +elif [[ "$DRIVER_ENABLED" == "false" ]]; then + check_fail "GPU driver is DISABLED (driver.enabled=false) — driverless images need the operator to manage the driver" +else + check_warn "Could not determine if GPU driver is enabled" +fi + +# Check driver version by running nvidia-smi inside a nvidia-driver-daemonset pod +DRIVER_POD=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" \ + -l app=nvidia-driver-daemonset --field-selector=status.phase=Running \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [[ -z "$DRIVER_POD" ]]; then + check_warn "No running nvidia-driver-daemonset pod found — cannot check driver version" +else + DRIVER_NODE=$(kubectl get pod -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" "$DRIVER_POD" \ + -o jsonpath='{.spec.nodeName}' 2>/dev/null || echo "unknown") + log_info "Running 
nvidia-smi in pod ${DRIVER_POD} (node ${DRIVER_NODE})..." + + NVIDIA_SMI_OUTPUT=$(kubectl exec -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" "$DRIVER_POD" -- \ + nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]' || echo "") + + if [[ -z "$NVIDIA_SMI_OUTPUT" ]]; then + check_fail "Could not run nvidia-smi in pod ${DRIVER_POD}" + elif [[ "$NVIDIA_SMI_OUTPUT" == "$EXPECTED_DRIVER_VERSION" ]]; then + check_pass "nvidia-smi driver version: ${NVIDIA_SMI_OUTPUT} (on ${DRIVER_NODE})" + else + check_fail "nvidia-smi driver version: ${NVIDIA_SMI_OUTPUT} (expected ${EXPECTED_DRIVER_VERSION}, on ${DRIVER_NODE})" + fi +fi + +# Check nvidia-driver-daemonset pods are running +DRIVER_PODS=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -l app=nvidia-driver-daemonset --no-headers 2>/dev/null | wc -l | tr -d ' ') +DRIVER_PODS_READY=$(kubectl get pods -n "${GPU_OPERATOR_NAMESPACE:-gpu-operator}" -l app=nvidia-driver-daemonset --no-headers 2>/dev/null | grep -c "Running" || echo "0") +if [[ "$DRIVER_PODS" -gt 0 ]]; then + if [[ "$DRIVER_PODS_READY" -eq "$DRIVER_PODS" ]]; then + check_pass "nvidia-driver-daemonset: ${DRIVER_PODS_READY}/${DRIVER_PODS} pods Running" + else + check_fail "nvidia-driver-daemonset: ${DRIVER_PODS_READY}/${DRIVER_PODS} pods Running" + fi +else + check_warn "No nvidia-driver-daemonset pods found (expected when driver.enabled=true)" +fi + +# ============================================================================= +# Check 2: /mnt/data mounted on a GPU node +# ============================================================================= +echo "" +log_info "--- Check 2: /mnt/data on GPU node ---" + +GPU_NODE=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [[ -z "$GPU_NODE" ]]; then + check_warn "No GPU nodes found — cannot verify /mnt/data" +else + POD_NAME="mnt-check-verify" + kubectl delete pod "$POD_NAME" --force --grace-period=0 
&>/dev/null || true + kubectl run "$POD_NAME" --image=busybox --restart=Never \ + --overrides="{ + \"spec\":{ + \"nodeName\":\"${GPU_NODE}\", + \"containers\":[{ + \"name\":\"check\", + \"image\":\"busybox\", + \"command\":[\"sh\",\"-c\",\"grep -q ' /host-mnt/data ' /host-proc/mounts && echo MOUNTED || echo NOT_MOUNTED\"], + \"volumeMounts\":[ + {\"name\":\"host-proc\",\"mountPath\":\"/host-proc\",\"readOnly\":true}, + {\"name\":\"host-mnt\",\"mountPath\":\"/host-mnt\",\"readOnly\":true} + ] + }], + \"volumes\":[ + {\"name\":\"host-proc\",\"hostPath\":{\"path\":\"/proc\",\"type\":\"Directory\"}}, + {\"name\":\"host-mnt\",\"hostPath\":{\"path\":\"/mnt\",\"type\":\"Directory\"}} + ], + \"tolerations\":[{\"operator\":\"Exists\"}], + \"restartPolicy\":\"Never\" + } + }" &>/dev/null + + kubectl wait --for=jsonpath='{.status.phase}'=Succeeded "pod/$POD_NAME" --timeout=30s &>/dev/null + + RESULT=$(kubectl logs "$POD_NAME" 2>/dev/null | tail -1) + kubectl delete pod "$POD_NAME" --force --grace-period=0 &>/dev/null || true + + if [[ "$RESULT" == "MOUNTED" ]]; then + check_pass "GPU node ${GPU_NODE}: /mnt/data mounted" + elif [[ "$RESULT" == "NOT_MOUNTED" ]]; then + check_fail "GPU node ${GPU_NODE}: /mnt/data NOT mounted" + else + check_warn "GPU node ${GPU_NODE}: could not verify /mnt/data" + fi +fi + +# ============================================================================= +# Check 3–6 require OSMO API access via port-forward +# ============================================================================= +echo "" +log_info "--- Setting up OSMO API access ---" + +# Start port-forward for OSMO API checks +start_osmo_port_forward "${OSMO_NAMESPACE}" 8080 + +cleanup_port_forward() { + if [[ -n "${PORT_FORWARD_PID:-}" ]]; then + kill $PORT_FORWARD_PID 2>/dev/null || true + wait $PORT_FORWARD_PID 2>/dev/null || true + fi +} +trap cleanup_port_forward EXIT RETURN + +# Wait for port-forward +max_wait=15 +elapsed=0 +while ! 
curl -s -o /dev/null -w "%{http_code}" "${OSMO_URL}/api/version" 2>/dev/null | grep -q "200\|401\|403"; do + sleep 1 + ((elapsed += 1)) + if [[ $elapsed -ge $max_wait ]]; then + log_error "Port-forward failed to start within ${max_wait}s" + log_error "Skipping OSMO API checks (3–6). Ensure OSMO is running in namespace '${OSMO_NAMESPACE}'." + cleanup_port_forward + # Print partial summary and return + echo "" + echo "========================================" + echo " Verification Summary (partial)" + echo "========================================" + echo "" + echo -e " ${GREEN}Passed: ${PASS}${NC}" + echo -e " ${RED}Failed: ${FAIL}${NC}" + echo -e " ${YELLOW}Warnings: ${WARN}${NC}" + echo -e " Skipped: checks 3–6 (OSMO API unreachable)" + echo "" + return 2>/dev/null || true + fi +done + +osmo_login 8080 || true + +# ============================================================================= +# Check 3: Shared memory pod template (64Gi /dev/shm) +# ============================================================================= +echo "" +log_info "--- Check 3: Shared memory pod template ---" + +SHM_TEMPLATE=$(osmo_curl GET "${OSMO_URL}/api/configs/pod_template/shm" 2>/dev/null || echo "") +if [[ -n "$SHM_TEMPLATE" && "$SHM_TEMPLATE" != "null" && "$SHM_TEMPLATE" != "{}" ]]; then + # Check sizeLimit (API may return under .configs.spec or .spec) + SHM_SIZE=$(echo "$SHM_TEMPLATE" | jq -r '(.configs.spec // .spec).volumes[]? | select(.name=="shm") | .emptyDir.sizeLimit // empty' 2>/dev/null || echo "") + if [[ "$SHM_SIZE" == "$EXPECTED_SHM_SIZE" ]]; then + check_pass "shm pod template: sizeLimit=${SHM_SIZE}" + elif [[ -n "$SHM_SIZE" ]]; then + check_fail "shm pod template: sizeLimit=${SHM_SIZE} (expected ${EXPECTED_SHM_SIZE})" + else + check_warn "shm pod template exists but could not read sizeLimit" + fi + + # Check /dev/shm mount + SHM_MOUNT=$(echo "$SHM_TEMPLATE" | jq -r '(.configs.spec // .spec).containers[]?.volumeMounts[]? 
| select(.mountPath=="/dev/shm") | .name // empty' 2>/dev/null || echo "") + if [[ "$SHM_MOUNT" == "shm" ]]; then + check_pass "shm pod template: /dev/shm volumeMount configured" + else + check_fail "shm pod template: /dev/shm volumeMount NOT found" + fi +else + check_fail "shm pod template NOT found in OSMO" +fi + +# ============================================================================= +# Check 4: Redis configuration (8 vCPU, ~52.82Gi mem, 50Gi PVC) +# ============================================================================= +echo "" +log_info "--- Check 4: Redis resources ---" + +REDIS_STS=$(kubectl get statefulset redis-master -n "${OSMO_NAMESPACE}" -o json 2>/dev/null || echo "") +if [[ -z "$REDIS_STS" || "$REDIS_STS" == "" ]]; then + check_fail "Redis statefulset 'redis-master' not found in namespace ${OSMO_NAMESPACE}" +else + # CPU requests + REDIS_CPU=$(echo "$REDIS_STS" | jq -r '.spec.template.spec.containers[] | select(.name=="redis") | .resources.requests.cpu // empty' 2>/dev/null || echo "") + REDIS_CPU_NUM=$(echo "$REDIS_CPU" | sed 's/m$//' || echo "0") + if [[ "$REDIS_CPU" =~ m$ ]]; then + REDIS_CPU_CORES=$((REDIS_CPU_NUM / 1000)) + else + REDIS_CPU_CORES=$REDIS_CPU_NUM + fi + + if [[ "$REDIS_CPU_CORES" -ge "$MIN_REDIS_CPU" ]] 2>/dev/null; then + check_pass "Redis CPU requests: ${REDIS_CPU} (>= ${MIN_REDIS_CPU} cores)" + else + check_fail "Redis CPU requests: ${REDIS_CPU} (expected >= ${MIN_REDIS_CPU} cores)" + fi + + # Memory requests + REDIS_MEM=$(echo "$REDIS_STS" | jq -r '.spec.template.spec.containers[] | select(.name=="redis") | .resources.requests.memory // empty' 2>/dev/null || echo "") + REDIS_MEM_NUM=$(echo "$REDIS_MEM" | sed -E 's/[A-Za-z]+$//') + REDIS_MEM_UNIT=$(echo "$REDIS_MEM" | sed -E 's/^[0-9.]+//') + case "$REDIS_MEM_UNIT" in + Gi) REDIS_MEM_GI=$REDIS_MEM_NUM ;; + Mi) REDIS_MEM_GI=$((REDIS_MEM_NUM / 1024)) ;; + *) REDIS_MEM_GI=0 ;; + esac + + if [[ "$REDIS_MEM_GI" -ge "$MIN_REDIS_MEMORY_GI" ]] 2>/dev/null; then + 
check_pass "Redis memory requests: ${REDIS_MEM} (>= ${MIN_REDIS_MEMORY_GI}Gi)" + else + check_fail "Redis memory requests: ${REDIS_MEM} (expected >= ${MIN_REDIS_MEMORY_GI}Gi)" + fi + + # PVC size + REDIS_PVC_SIZE=$(kubectl get pvc -n "${OSMO_NAMESPACE}" -l app.kubernetes.io/name=redis --no-headers -o jsonpath='{.items[0].spec.resources.requests.storage}' 2>/dev/null || echo "") + REDIS_PVC_NUM=$(echo "$REDIS_PVC_SIZE" | sed -E 's/[A-Za-z]+$//') + REDIS_PVC_UNIT=$(echo "$REDIS_PVC_SIZE" | sed -E 's/^[0-9.]+//') + case "$REDIS_PVC_UNIT" in + Gi) REDIS_PVC_GI=$REDIS_PVC_NUM ;; + Ti) REDIS_PVC_GI=$((REDIS_PVC_NUM * 1024)) ;; + *) REDIS_PVC_GI=0 ;; + esac + + if [[ "$REDIS_PVC_GI" -ge "$MIN_REDIS_PVC_GI" ]] 2>/dev/null; then + check_pass "Redis PVC size: ${REDIS_PVC_SIZE} (>= ${MIN_REDIS_PVC_GI}Gi)" + else + check_fail "Redis PVC size: ${REDIS_PVC_SIZE:-unknown} (expected >= ${MIN_REDIS_PVC_GI}Gi)" + fi +fi + +# ============================================================================= +# Check 5: max_num_tasks >= 200 +# ============================================================================= +echo "" +log_info "--- Check 5: WORKFLOW max_num_tasks ---" + +WORKFLOW_CONFIG=$(osmo_curl GET "${OSMO_URL}/api/configs/workflow" 2>/dev/null || echo "") +if [[ -n "$WORKFLOW_CONFIG" && "$WORKFLOW_CONFIG" != "null" ]]; then + MAX_NUM_TASKS=$(echo "$WORKFLOW_CONFIG" | jq -r '.max_num_tasks // .configs_dict.max_num_tasks // empty' 2>/dev/null || echo "") + if [[ -z "$MAX_NUM_TASKS" ]]; then + check_fail "max_num_tasks not set in WORKFLOW config (default is too low)" + elif [[ "$MAX_NUM_TASKS" -ge "$MIN_MAX_NUM_TASKS" ]] 2>/dev/null; then + check_pass "max_num_tasks: ${MAX_NUM_TASKS} (>= ${MIN_MAX_NUM_TASKS})" + else + check_fail "max_num_tasks: ${MAX_NUM_TASKS} (expected >= ${MIN_MAX_NUM_TASKS})" + fi +else + check_fail "Could not retrieve WORKFLOW config from OSMO API" +fi + +# ============================================================================= +# Check 6: 
Platform name is not default "gpu" +# ============================================================================= +echo "" +log_info "--- Check 6: Platform naming ---" + +POOL_CONFIG=$(osmo_curl GET "${OSMO_URL}/api/configs/pool/default" 2>/dev/null || echo "") +if [[ -n "$POOL_CONFIG" && "$POOL_CONFIG" != "null" ]]; then + PLATFORM_NAMES=$(echo "$POOL_CONFIG" | jq -r '.platforms // {} | keys[]' 2>/dev/null || echo "") + if [[ -z "$PLATFORM_NAMES" ]]; then + check_fail "No platforms found in default pool" + else + GPU_TYPE_FOUND=false + GENERIC_LIST="" + for NAME in $PLATFORM_NAMES; do + # Platform name must identify the GPU type (e.g. H100, H200, B200, L40S) + if echo "$NAME" | grep -qiE '^(h100|h200|b200|b300|l40s|a100|a10)'; then + check_pass "Platform '${NAME}': name identifies GPU type" + GPU_TYPE_FOUND=true + else + GENERIC_LIST="${GENERIC_LIST} ${NAME}" + fi + done + if [[ "$GPU_TYPE_FOUND" == "false" ]]; then + check_fail "No GPU-type platform found (only generic:${GENERIC_LIST}) — create one named after the GPU (e.g. H100)" + elif [[ -n "$GENERIC_LIST" ]]; then + log_info "Also found generic platforms:${GENERIC_LIST} (cannot be deleted, ignored)" + fi + fi +else + check_fail "Could not retrieve pool config from OSMO API" +fi + +# ============================================================================= +# Summary +# ============================================================================= +cleanup_port_forward +trap - EXIT RETURN + +echo "" +echo "========================================" +echo " Verification Summary" +echo "========================================" +echo "" +echo -e " ${GREEN}Passed: ${PASS}${NC}" +echo -e " ${RED}Failed: ${FAIL}${NC}" +echo -e " ${YELLOW}Warnings: ${WARN}${NC}" +echo "" + +if [[ "$FAIL" -gt 0 ]]; then + log_error "Installation has ${FAIL} issue(s) that need to be fixed." +elif [[ "$WARN" -gt 0 ]]; then + log_warning "Installation looks OK but has ${WARN} warning(s) to review." 
+else + log_success "All checks passed!" +fi diff --git a/applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh b/applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh new file mode 100755 index 000000000..a5053374d --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# +# Connect OSMO Backend to a Remote Control Plane +# +# Reconfigures an already-deployed backend operator to point to a remote +# control plane at a different URL (e.g. in another K8s cluster). +# +# Required inputs (env vars or positional args): +# REMOTE_CONTROL_PLANE_URL — external HTTPS URL of the remote control plane +# REMOTE_SERVICE_TOKEN — service token from the remote control plane +# +# Usage: +# export REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud +# export REMOTE_SERVICE_TOKEN= +# ./11-connect-remote-control-plane.sh +# +# Or with positional args: +# ./11-connect-remote-control-plane.sh +# + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +source "${SCRIPT_DIR}/lib/common.sh" +source "${SCRIPT_DIR}/defaults.sh" + +echo "" +echo "========================================" +echo " Connect Backend to Remote Control Plane" +echo "========================================" +echo "" + +# Check prerequisites +check_kubectl || exit 1 +check_helm || exit 1 + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- +OSMO_OPERATOR_NAMESPACE="${OSMO_OPERATOR_NAMESPACE:-osmo-operator}" +OSMO_WORKFLOWS_NAMESPACE="${OSMO_WORKFLOWS_NAMESPACE:-osmo-workflows}" +OSMO_IMAGE_TAG="${OSMO_IMAGE_TAG:-6.0.0}" +BACKEND_NAME="${OSMO_BACKEND_NAME:-default}" + +# Accept positional args or env vars +REMOTE_CONTROL_PLANE_URL="${1:-${REMOTE_CONTROL_PLANE_URL:-}}" +REMOTE_SERVICE_TOKEN="${2:-${REMOTE_SERVICE_TOKEN:-}}" + +# Validate 
required inputs +if [[ -z "$REMOTE_CONTROL_PLANE_URL" ]]; then + log_error "REMOTE_CONTROL_PLANE_URL is required." + echo "" + echo "Usage:" + echo " export REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud" + echo " export REMOTE_SERVICE_TOKEN=" + echo " ./11-connect-remote-control-plane.sh" + echo "" + echo " Or: ./11-connect-remote-control-plane.sh " + exit 1 +fi + +if [[ -z "$REMOTE_SERVICE_TOKEN" ]]; then + log_error "REMOTE_SERVICE_TOKEN is required." + echo "" + echo "Generate a service token on the remote control plane:" + echo " osmo token set backend-token-\$(date +%s) --service --roles osmo-backend --expires-at 2027-01-01" + echo "" + echo "Then export it:" + echo " export REMOTE_SERVICE_TOKEN=" + exit 1 +fi + +# Strip trailing slash from URL +REMOTE_CONTROL_PLANE_URL="${REMOTE_CONTROL_PLANE_URL%/}" + +log_info "Remote control plane URL: ${REMOTE_CONTROL_PLANE_URL}" +log_info "Backend name: ${BACKEND_NAME}" +log_info "Operator namespace: ${OSMO_OPERATOR_NAMESPACE}" + +# ----------------------------------------------------------------------------- +# Verify kubectl is connected +# ----------------------------------------------------------------------------- +log_info "Current kubectl context:" +kubectl config current-context +echo "" + +# ----------------------------------------------------------------------------- +# Test remote control plane reachability +# ----------------------------------------------------------------------------- +log_info "Testing remote control plane reachability..." + +HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 10 "${REMOTE_CONTROL_PLANE_URL}/api/version" 2>/dev/null || echo "000") + +if [[ "$HTTP_CODE" == "000" ]]; then + log_error "Cannot reach ${REMOTE_CONTROL_PLANE_URL}/api/version (connection failed)" + log_error "Check the URL and ensure the remote control plane is accessible from this network." 
+ exit 1 +elif [[ "$HTTP_CODE" =~ ^(200|401|403)$ ]]; then + log_success "Remote control plane reachable (HTTP ${HTTP_CODE})" +else + log_warning "Remote control plane returned HTTP ${HTTP_CODE} — proceeding anyway" +fi + +# ----------------------------------------------------------------------------- +# Check that osmo-operator release exists +# ----------------------------------------------------------------------------- +log_info "Checking for existing osmo-operator Helm release..." + +if ! helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then + log_error "No osmo-operator Helm release found in namespace ${OSMO_OPERATOR_NAMESPACE}" + log_error "Deploy the backend operator first: ./05-deploy-osmo-backend.sh" + exit 1 +fi +log_success "osmo-operator release found" + +# ----------------------------------------------------------------------------- +# Create/update the osmo-operator-token secret +# ----------------------------------------------------------------------------- +log_info "Updating osmo-operator-token secret..." + +kubectl create secret generic osmo-operator-token \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --from-literal=token="${REMOTE_SERVICE_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + +log_success "osmo-operator-token secret updated" + +# ----------------------------------------------------------------------------- +# Helm upgrade — update global.serviceUrl, keep everything else +# ----------------------------------------------------------------------------- +log_info "Updating osmo-operator Helm release with remote service URL..." 
+ +helm upgrade osmo-operator osmo/backend-operator \ + --namespace "${OSMO_OPERATOR_NAMESPACE}" \ + --reuse-values \ + --set "global.serviceUrl=${REMOTE_CONTROL_PLANE_URL}" \ + --wait \ + --timeout 5m + +log_success "Helm release updated with serviceUrl=${REMOTE_CONTROL_PLANE_URL}" + +# ----------------------------------------------------------------------------- +# Wait for backend-listener pod to restart +# ----------------------------------------------------------------------------- +log_info "Waiting for backend-listener pod to be ready..." + +# Give the rollout a moment to start +sleep 3 + +# Wait for all pods in the operator namespace to be ready +kubectl rollout status deployment -n "${OSMO_OPERATOR_NAMESPACE}" --timeout=120s 2>/dev/null || true + +# Check backend-listener pod status +LISTENER_POD=$(kubectl get pods -n "${OSMO_OPERATOR_NAMESPACE}" -l app=backend-listener -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + +if [[ -n "$LISTENER_POD" ]]; then + log_info "Checking backend-listener logs for connection status..." + # Wait a few seconds for connection attempt + sleep 5 + kubectl logs -n "${OSMO_OPERATOR_NAMESPACE}" "$LISTENER_POD" --tail=20 2>/dev/null || true + echo "" +else + log_warning "No backend-listener pod found — check deployment status" +fi + +# ----------------------------------------------------------------------------- +# Print status +# ----------------------------------------------------------------------------- +echo "" +kubectl get pods -n "${OSMO_OPERATOR_NAMESPACE}" + +echo "" +echo "========================================" +log_success "Backend connected to remote control plane!" 
+echo "========================================" +echo "" +echo "Remote Control Plane: ${REMOTE_CONTROL_PLANE_URL}" +echo "Backend Name: ${BACKEND_NAME}" +echo "Operator Namespace: ${OSMO_OPERATOR_NAMESPACE}" +echo "" +echo "To verify the backend is online on the remote control plane:" +echo "" +echo " curl ${REMOTE_CONTROL_PLANE_URL}/api/configs/backend" +echo "" +echo " Or using osmo CLI (logged into the remote control plane):" +echo " osmo config show BACKEND ${BACKEND_NAME}" +echo "" +echo "To check backend-listener logs:" +echo " kubectl logs -n ${OSMO_OPERATOR_NAMESPACE} -l app=backend-listener -f" +echo "" diff --git a/applications/osmo/deploy/example/002-setup/gpu_platform_update.json b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json index da10c6a05..1237894e7 100755 --- a/applications/osmo/deploy/example/002-setup/gpu_platform_update.json +++ b/applications/osmo/deploy/example/002-setup/gpu_platform_update.json @@ -1,6 +1,6 @@ { "configs": { - "description": "GPU platform for L40S nodes", + "description": "GPU platform", "host_network_allowed": false, "privileged_allowed": false, "allowed_mounts": ["/mnt/data"], @@ -9,6 +9,6 @@ "USER_GPU": 1 }, "resource_validations": [], - "override_pod_template": ["gpu_tolerations"] + "override_pod_template": ["gpu_tolerations", "shm"] } } From 131accf71ce06e497594f20fc2189d3b03e983c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 5 Mar 2026 21:41:18 +0100 Subject: [PATCH 32/37] - add shared memory pod template --- .../example/002-setup/shm_pod_template.json | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 applications/osmo/deploy/example/002-setup/shm_pod_template.json diff --git a/applications/osmo/deploy/example/002-setup/shm_pod_template.json b/applications/osmo/deploy/example/002-setup/shm_pod_template.json new file mode 100644 index 000000000..c7876c5db --- /dev/null +++ b/applications/osmo/deploy/example/002-setup/shm_pod_template.json 
@@ -0,0 +1,21 @@ +{ + "configs": { + "spec": { + "containers": [{ + "name": "{{USER_CONTAINER_NAME}}", + "volumeMounts": [{ + "name": "shm", + "mountPath": "/dev/shm" + }] + }], + "volumes": [{ + "name": "shm", + "emptyDir": { + "medium": "Memory", + "sizeLimit": "64Gi" + } + }] + } + }, + "description": "Add shared memory volume for IPC (PyTorch, vLLM, TensorRT, etc.)" +} From 19374c46cf08709da80fa3e40e5944bdc1a00d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Thu, 5 Mar 2026 21:46:30 +0100 Subject: [PATCH 33/37] - reorder scripts --- .../osmo/deploy/example/002-setup/04-enable-tls.sh | 2 +- ...ontrol-plane.sh => 05-deploy-osmo-control-plane.sh} | 10 +++++----- ...eploy-osmo-backend.sh => 06-deploy-osmo-backend.sh} | 4 ++-- ...06-configure-storage.sh => 07-configure-storage.sh} | 0 ...gure-service-url.sh => 08-configure-service-url.sh} | 0 ...re-gpu-platform.sh => 09-configure-gpu-platform.sh} | 0 ...-scheduler.sh => 10-configure-backend-scheduler.sh} | 0 ...ataset-bucket.sh => 11-configure-dataset-bucket.sh} | 0 ...erify-installation.sh => 12-verify-installation.sh} | 2 +- ...ol-plane.sh => 99a-connect-remote-control-plane.sh} | 10 +++++----- ...credentials.sh => 99b-show-keycloak-credentials.sh} | 0 11 files changed, 14 insertions(+), 14 deletions(-) rename applications/osmo/deploy/example/002-setup/{04-deploy-osmo-control-plane.sh => 05-deploy-osmo-control-plane.sh} (99%) rename applications/osmo/deploy/example/002-setup/{05-deploy-osmo-backend.sh => 06-deploy-osmo-backend.sh} (99%) rename applications/osmo/deploy/example/002-setup/{06-configure-storage.sh => 07-configure-storage.sh} (100%) rename applications/osmo/deploy/example/002-setup/{07-configure-service-url.sh => 08-configure-service-url.sh} (100%) rename applications/osmo/deploy/example/002-setup/{08-configure-gpu-platform.sh => 09-configure-gpu-platform.sh} (100%) rename applications/osmo/deploy/example/002-setup/{09-configure-backend-scheduler.sh => 
10-configure-backend-scheduler.sh} (100%) rename applications/osmo/deploy/example/002-setup/{10-configure-dataset-bucket.sh => 11-configure-dataset-bucket.sh} (100%) rename applications/osmo/deploy/example/002-setup/{10-verify-installation.sh => 12-verify-installation.sh} (99%) rename applications/osmo/deploy/example/002-setup/{11-connect-remote-control-plane.sh => 99a-connect-remote-control-plane.sh} (96%) rename applications/osmo/deploy/example/002-setup/{12-show-keycloak-credentials.sh => 99b-show-keycloak-credentials.sh} (100%) diff --git a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh index bb9ae2a96..b6b25fb47 100755 --- a/applications/osmo/deploy/example/002-setup/04-enable-tls.sh +++ b/applications/osmo/deploy/example/002-setup/04-enable-tls.sh @@ -732,7 +732,7 @@ else echo " 1. Certificates stored in: ${CERT_DIR}" echo " Renewal: re-run this script before the 90-day expiry" fi - echo " 2. Deploy OSMO: ./04-deploy-osmo-control-plane.sh" + echo " 2. Deploy OSMO: ./05-deploy-osmo-control-plane.sh" echo " (It will auto-detect the TLS cert and create HTTPS Ingress)" if [[ -n "$AUTH_HOSTNAME" ]]; then echo " 3. Keycloak will be exposed at https://${AUTH_HOSTNAME}" diff --git a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh similarity index 99% rename from applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh rename to applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh index 1052b9db9..a97c5990a 100755 --- a/applications/osmo/deploy/example/002-setup/04-deploy-osmo-control-plane.sh +++ b/applications/osmo/deploy/example/002-setup/05-deploy-osmo-control-plane.sh @@ -1868,7 +1868,7 @@ elif [[ -n "$INGRESS_URL" ]]; then log_info "Auto-detected service URL: ${TARGET_SERVICE_URL}" else log_warning "Could not detect Ingress URL. 
Skipping service_base_url configuration." - log_warning "Run ./07-configure-service-url.sh manually after verifying the Ingress." + log_warning "Run ./08-configure-service-url.sh manually after verifying the Ingress." TARGET_SERVICE_URL="" fi @@ -1920,15 +1920,15 @@ SVCEOF if [[ "$NEW_SVC_URL" == "$TARGET_SERVICE_URL" ]]; then log_success "service_base_url configured: ${NEW_SVC_URL}" else - log_warning "service_base_url verification failed. Run ./07-configure-service-url.sh manually." + log_warning "service_base_url verification failed. Run ./08-configure-service-url.sh manually." fi else - log_warning "Failed to set service_base_url. Run ./07-configure-service-url.sh manually." + log_warning "Failed to set service_base_url. Run ./08-configure-service-url.sh manually." fi rm -f /tmp/service_url_fix.json fi else - log_warning "Port-forward not ready. Run ./07-configure-service-url.sh manually." + log_warning "Port-forward not ready. Run ./08-configure-service-url.sh manually." fi _cleanup_pf @@ -2009,5 +2009,5 @@ echo "Ingress resources:" kubectl get ingress -n "${OSMO_NAMESPACE}" 2>/dev/null || true echo "" echo "Next step - Deploy Backend Operator:" -echo " ./05-deploy-osmo-backend.sh" +echo " ./06-deploy-osmo-backend.sh" echo "" diff --git a/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh b/applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh similarity index 99% rename from applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh rename to applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh index 5f1d5767c..e720c9cb5 100755 --- a/applications/osmo/deploy/example/002-setup/05-deploy-osmo-backend.sh +++ b/applications/osmo/deploy/example/002-setup/06-deploy-osmo-backend.sh @@ -50,7 +50,7 @@ if [[ -z "${OSMO_SERVICE_URL:-}" ]]; then log_success "In-cluster Agent URL: ${OSMO_SERVICE_URL}" else echo "" - log_error "Could not detect OSMO Agent service. 
Deploy OSMO first: ./04-deploy-osmo-control-plane.sh" + log_error "Could not detect OSMO Agent service. Deploy OSMO first: ./05-deploy-osmo-control-plane.sh" log_error "Note: Backend operators require osmo-agent service for WebSocket connections" exit 1 fi @@ -406,5 +406,5 @@ else fi echo "" echo "Next step - Configure Storage:" -echo " ./06-configure-storage.sh" +echo " ./07-configure-storage.sh" echo "" diff --git a/applications/osmo/deploy/example/002-setup/06-configure-storage.sh b/applications/osmo/deploy/example/002-setup/07-configure-storage.sh similarity index 100% rename from applications/osmo/deploy/example/002-setup/06-configure-storage.sh rename to applications/osmo/deploy/example/002-setup/07-configure-storage.sh diff --git a/applications/osmo/deploy/example/002-setup/07-configure-service-url.sh b/applications/osmo/deploy/example/002-setup/08-configure-service-url.sh similarity index 100% rename from applications/osmo/deploy/example/002-setup/07-configure-service-url.sh rename to applications/osmo/deploy/example/002-setup/08-configure-service-url.sh diff --git a/applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh b/applications/osmo/deploy/example/002-setup/09-configure-gpu-platform.sh similarity index 100% rename from applications/osmo/deploy/example/002-setup/08-configure-gpu-platform.sh rename to applications/osmo/deploy/example/002-setup/09-configure-gpu-platform.sh diff --git a/applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh b/applications/osmo/deploy/example/002-setup/10-configure-backend-scheduler.sh similarity index 100% rename from applications/osmo/deploy/example/002-setup/09-configure-backend-scheduler.sh rename to applications/osmo/deploy/example/002-setup/10-configure-backend-scheduler.sh diff --git a/applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh b/applications/osmo/deploy/example/002-setup/11-configure-dataset-bucket.sh similarity index 100% rename from 
applications/osmo/deploy/example/002-setup/10-configure-dataset-bucket.sh rename to applications/osmo/deploy/example/002-setup/11-configure-dataset-bucket.sh diff --git a/applications/osmo/deploy/example/002-setup/10-verify-installation.sh b/applications/osmo/deploy/example/002-setup/12-verify-installation.sh similarity index 99% rename from applications/osmo/deploy/example/002-setup/10-verify-installation.sh rename to applications/osmo/deploy/example/002-setup/12-verify-installation.sh index acd5e55f1..624832d7a 100755 --- a/applications/osmo/deploy/example/002-setup/10-verify-installation.sh +++ b/applications/osmo/deploy/example/002-setup/12-verify-installation.sh @@ -21,7 +21,7 @@ # - NEBIUS_REGION set (run: source ../000-prerequisites/nebius-env-init.sh) # # Usage: -# ./10-verify-installation.sh +# ./12-verify-installation.sh # # Environment variables (optional overrides): # OSMO_URL OSMO API URL (default: http://localhost:8080) diff --git a/applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh b/applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh similarity index 96% rename from applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh rename to applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh index a5053374d..824c2e1da 100755 --- a/applications/osmo/deploy/example/002-setup/11-connect-remote-control-plane.sh +++ b/applications/osmo/deploy/example/002-setup/99a-connect-remote-control-plane.sh @@ -12,10 +12,10 @@ # Usage: # export REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud # export REMOTE_SERVICE_TOKEN= -# ./11-connect-remote-control-plane.sh +# ./99a-connect-remote-control-plane.sh # # Or with positional args: -# ./11-connect-remote-control-plane.sh +# ./99a-connect-remote-control-plane.sh # set -e @@ -53,9 +53,9 @@ if [[ -z "$REMOTE_CONTROL_PLANE_URL" ]]; then echo "Usage:" echo " export 
REMOTE_CONTROL_PLANE_URL=https://os1.eu-north1.osmo.nebius.cloud" echo " export REMOTE_SERVICE_TOKEN=" - echo " ./11-connect-remote-control-plane.sh" + echo " ./99a-connect-remote-control-plane.sh" echo "" - echo " Or: ./11-connect-remote-control-plane.sh " + echo " Or: ./99a-connect-remote-control-plane.sh " exit 1 fi @@ -108,7 +108,7 @@ log_info "Checking for existing osmo-operator Helm release..." if ! helm status osmo-operator -n "${OSMO_OPERATOR_NAMESPACE}" &>/dev/null; then log_error "No osmo-operator Helm release found in namespace ${OSMO_OPERATOR_NAMESPACE}" - log_error "Deploy the backend operator first: ./05-deploy-osmo-backend.sh" + log_error "Deploy the backend operator first: ./06-deploy-osmo-backend.sh" exit 1 fi log_success "osmo-operator release found" diff --git a/applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh b/applications/osmo/deploy/example/002-setup/99b-show-keycloak-credentials.sh similarity index 100% rename from applications/osmo/deploy/example/002-setup/12-show-keycloak-credentials.sh rename to applications/osmo/deploy/example/002-setup/99b-show-keycloak-credentials.sh From ed4780592eef4940a897289680da1c27169b78bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 10 Mar 2026 10:05:41 +0100 Subject: [PATCH 34/37] - more test workflows --- .../osmo/workflows/osmo/test_gpu_driver.yaml | 53 ++++++ .../osmo/workflows/osmo/test_mnt_data.yaml | 64 ++++++++ .../osmo/workflows/osmo/test_multi_gpu.yaml | 57 +++++++ .../osmo/test_shared_fs_cross_node.yaml | 155 ++++++++++++++++++ .../osmo/workflows/osmo/test_shm.yaml | 75 +++++++++ 5 files changed, 404 insertions(+) create mode 100644 applications/osmo/workflows/osmo/test_gpu_driver.yaml create mode 100644 applications/osmo/workflows/osmo/test_mnt_data.yaml create mode 100644 applications/osmo/workflows/osmo/test_multi_gpu.yaml create mode 100644 applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml create mode 100644 
applications/osmo/workflows/osmo/test_shm.yaml diff --git a/applications/osmo/workflows/osmo/test_gpu_driver.yaml b/applications/osmo/workflows/osmo/test_gpu_driver.yaml new file mode 100644 index 000000000..a150e91a5 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_gpu_driver.yaml @@ -0,0 +1,53 @@ +# GPU Driver Version Test Workflow +# Validates that the correct NVIDIA driver is installed (580.95.05) +# and that CUDA is functional on H100 GPUs. +# +# Submit with: +# osmo workflow submit workflows/osmo/test_gpu_driver.yaml + +workflow: + name: test-gpu-driver + resources: + gpu-resource: + platform: H100 + gpu: 1 + cpu: 2 + memory: 4Gi + tasks: + - name: check-driver + image: nvidia/cuda:13.0.2-base-ubuntu24.04 + command: ["bash", "-c"] + args: + - | + echo "=== GPU Driver Verification ===" + echo "" + EXPECTED="580.95.05" + + echo "--- Driver version ---" + DRIVER=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1 | tr -d '[:space:]') + echo "Installed: ${DRIVER}" + echo "Expected: ${EXPECTED}" + if [ "$DRIVER" = "$EXPECTED" ]; then + echo "PASS: driver version matches" + else + echo "FAIL: driver version mismatch" + exit 1 + fi + echo "" + + echo "--- GPU info ---" + nvidia-smi --query-gpu=name,memory.total,pci.bus_id --format=csv + echo "" + + echo "--- All GPUs visible ---" + GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + echo "GPU count: ${GPU_COUNT}" + echo "" + + echo "--- CUDA compute test ---" + # Simple CUDA validation via nvidia-smi + nvidia-smi -q | grep -E "CUDA Version|Product Name|Driver Version" + echo "" + + echo "=== GPU Driver Verification Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/test_mnt_data.yaml b/applications/osmo/workflows/osmo/test_mnt_data.yaml new file mode 100644 index 000000000..6f3687cc4 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_mnt_data.yaml @@ -0,0 +1,64 @@ +# /mnt/data Filestore Test Workflow +# Validates that the 
Nebius Filestore is mounted at /mnt/data and measures I/O speed. +# Uses the H100 platform (with shm template). +# +# Submit with: +# osmo workflow submit workflows/osmo/test_mnt_data.yaml + +workflow: + name: test-mnt-data + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 4 + memory: 4Gi + storage: 1Gi + tasks: + - name: check-mnt-data + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== /mnt/data Filestore Test ===" + echo "" + + # Check mount + echo "--- Mount check ---" + if mountpoint -q /mnt/data 2>/dev/null || df /mnt/data 2>/dev/null | grep -q /mnt/data; then + echo "PASS: /mnt/data is mounted" + else + echo "FAIL: /mnt/data is NOT mounted" + exit 1 + fi + echo "" + + echo "--- Filesystem info ---" + df -hT /mnt/data + echo "" + + echo "--- Mount type ---" + mount | grep /mnt/data || echo "(not visible in mount table -- may be host mount)" + echo "" + + # Write test + echo "--- Write test (256MB) ---" + TEST_DIR="/mnt/data/.osmo-bench-$$" + mkdir -p "${TEST_DIR}" + WRITE_OUT=$(dd if=/dev/zero of="${TEST_DIR}/bench.tmp" bs=1M count=256 conv=fdatasync 2>&1) + echo "$WRITE_OUT" + echo "" + + # Read test + echo "--- Read test (256MB) ---" + READ_OUT=$(dd if="${TEST_DIR}/bench.tmp" of=/dev/null bs=1M 2>&1) + echo "$READ_OUT" + echo "" + + # Cleanup + rm -rf "${TEST_DIR}" + + echo "=== /mnt/data Filestore Test Complete ===" + resource: gpu-resource + volumeMounts: + - /mnt/data diff --git a/applications/osmo/workflows/osmo/test_multi_gpu.yaml b/applications/osmo/workflows/osmo/test_multi_gpu.yaml new file mode 100644 index 000000000..1cc7c9508 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_multi_gpu.yaml @@ -0,0 +1,57 @@ +# Multi-GPU NCCL Bandwidth Test +# Requests 8 GPUs on a single H100 node and runs nccl-tests all_reduce_perf +# to validate GPU-to-GPU (NVLink/NVSwitch) communication. +# +# Uses CUDA 13.0 devel image (~4GB) to compile nccl-tests on the fly. 
+# +# Submit with: +# osmo workflow submit workflows/osmo/test_multi_gpu.yaml + +workflow: + name: test-multi-gpu + resources: + gpu-resource: + platform: gpu + gpu: 8 + cpu: 16 + memory: 64Gi + tasks: + - name: nccl-test + image: nvidia/cuda:13.0.2-devel-ubuntu24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Multi-GPU NCCL Test (8x H100) ===" + echo "" + + echo "--- GPU topology ---" + nvidia-smi topo -m + echo "" + + echo "--- GPU summary ---" + nvidia-smi --query-gpu=index,name,memory.total --format=csv + GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + echo "" + echo "GPUs visible: ${GPU_COUNT}" + if [ "$GPU_COUNT" -lt 8 ]; then + echo "FAIL: expected 8 GPUs, got ${GPU_COUNT}" + exit 1 + fi + echo "PASS: all 8 GPUs visible" + echo "" + + echo "--- Building nccl-tests ---" + apt-get update -qq && apt-get install -y -qq git build-essential &>/dev/null + cd /tmp + git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git &>/dev/null + cd nccl-tests + make MPI=0 -j$(nproc) &>/dev/null + echo "Built successfully" + echo "" + + echo "--- NCCL all_reduce bandwidth test ---" + ./build/all_reduce_perf -b 8M -e 1G -f 2 -g 8 + echo "" + + echo "=== Multi-GPU NCCL Test Complete ===" + resource: gpu-resource diff --git a/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml b/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml new file mode 100644 index 000000000..66b348a70 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_shared_fs_cross_node.yaml @@ -0,0 +1,155 @@ +# Shared Filesystem Cross-Node Test +# Validates that /mnt/data is truly shared across different tasks and nodes. 
+# +# How it works: +# - 3 tasks run in parallel, each on a potentially different GPU node +# - Each writes a marker file with hostname + K8s node name to /mnt/data +# - After a sync delay, each reads ALL markers to verify cross-task visibility +# - The last task to finish cleans up the test directory +# +# Submit with: +# osmo workflow submit workflows/osmo/test_shared_fs_cross_node.yaml + +workflow: + name: test-shared-fs-cross-node + resources: + gpu-resource: + platform: gpu + gpu: 1 + cpu: 2 + memory: 2Gi + storage: 1Gi + tasks: + - name: node-a + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-a" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + K8S_NODE=$(cat /etc/hostname 2>/dev/null || echo "unknown") + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + echo "K8s node env: ${MY_NODE_NAME:-not-set}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + # Wait for all tasks to write + echo "Waiting 30s for other tasks..." + sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + echo " $(basename $f): $(cat $f)" + done + echo "" + echo "Result: ${TASK} sees ${FOUND}/3 markers" + [ "${FOUND}" -ge 2 ] && echo "PASS" || echo "FAIL" + resource: gpu-resource + volumeMounts: + - /mnt/data + + - name: node-b + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-b" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + echo "Waiting 30s for other tasks..." 
+ sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + echo " $(basename $f): $(cat $f)" + done + echo "" + echo "Result: ${TASK} sees ${FOUND}/3 markers" + [ "${FOUND}" -ge 2 ] && echo "PASS" || echo "FAIL" + resource: gpu-resource + volumeMounts: + - /mnt/data + + - name: node-c + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + set -e + TASK="node-c" + DIR="/mnt/data/.osmo-fs-test" + HOST=$(hostname) + + echo "=== Task ${TASK} ===" + echo "Pod hostname: ${HOST}" + df -h /mnt/data + echo "" + + mkdir -p "${DIR}" + echo "${TASK}|${HOST}|$(date -Iseconds)" > "${DIR}/${TASK}.marker" + echo "Wrote: $(cat ${DIR}/${TASK}.marker)" + + echo "Waiting 30s for other tasks..." + sleep 30 + + echo "" + echo "=== All marker files ===" + FOUND=0 + HOSTS="" + for f in "${DIR}"/*.marker; do + [ -f "$f" ] || continue + FOUND=$((FOUND + 1)) + CONTENT=$(cat "$f") + echo " $(basename $f): ${CONTENT}" + H=$(echo "$CONTENT" | cut -d'|' -f2) + HOSTS="${HOSTS} ${H}" + done + NUM_UNIQUE=$(echo ${HOSTS} | tr ' ' '\n' | sort -u | grep -c . 
|| true) + echo "" + echo "========================================" + echo "SUMMARY" + echo "========================================" + echo "Markers visible: ${FOUND}/3" + echo "Unique pods: ${NUM_UNIQUE}" + if [ "${FOUND}" -ge 3 ]; then + echo "STATUS: PASS - all tasks see all data" + elif [ "${FOUND}" -ge 2 ]; then + echo "STATUS: PASS - cross-task sharing works" + else + echo "STATUS: FAIL - data not shared" + fi + echo "========================================" + + # Cleanup (node-c runs last due to sleep timing) + sleep 5 + rm -rf "${DIR}" + resource: gpu-resource + volumeMounts: + - /mnt/data diff --git a/applications/osmo/workflows/osmo/test_shm.yaml b/applications/osmo/workflows/osmo/test_shm.yaml new file mode 100644 index 000000000..d5645ae58 --- /dev/null +++ b/applications/osmo/workflows/osmo/test_shm.yaml @@ -0,0 +1,75 @@ +# Shared Memory (/dev/shm) Test Workflow +# Validates that the shm pod template is applied correctly: +# - /dev/shm is mounted as tmpfs +# - Size is 64Gi (as configured in the shm pod template) +# - Measures sequential write/read bandwidth +# +# Requires: shm pod template configured, H100 platform with shm in override_pod_template +# +# Submit with: +# osmo workflow submit workflows/osmo/test_shm.yaml + +workflow: + name: test-shm + resources: + gpu-resource: + platform: H100 + gpu: 1 + cpu: 4 + memory: 8Gi + tasks: + - name: check-shm + image: ubuntu:24.04 + command: ["bash", "-c"] + args: + - | + echo "=== Shared Memory (/dev/shm) Test ===" + echo "" + + # Check mount + echo "--- Mount info ---" + if mount | grep -q "/dev/shm.*tmpfs"; then + echo "PASS: /dev/shm is mounted as tmpfs" + mount | grep "/dev/shm" + else + echo "FAIL: /dev/shm is NOT a tmpfs mount" + mount | grep shm || echo "(no shm mount found)" + exit 1 + fi + echo "" + + # Check size + echo "--- Size ---" + SHM_SIZE_KB=$(df -k /dev/shm | tail -1 | awk '{print $2}') + SHM_SIZE_GI=$((SHM_SIZE_KB / 1024 / 1024)) + echo "Total: ${SHM_SIZE_GI}Gi (${SHM_SIZE_KB} KB)" + 
if [ "$SHM_SIZE_GI" -ge 60 ]; then + echo "PASS: size >= 60Gi" + else + echo "FAIL: size ${SHM_SIZE_GI}Gi is less than expected 64Gi" + exit 1 + fi + echo "" + + # Bandwidth test - write + echo "--- Write bandwidth (1GB) ---" + WRITE_OUT=$(dd if=/dev/zero of=/dev/shm/bench.tmp bs=1M count=1024 conv=fdatasync 2>&1) + echo "$WRITE_OUT" + WRITE_SPEED=$(echo "$WRITE_OUT" | grep -oP '[\d.]+ [GM]B/s' | tail -1) + echo "Write speed: ${WRITE_SPEED:-see above}" + echo "" + + # Bandwidth test - read + echo "--- Read bandwidth (1GB) ---" + # Drop caches not possible without privileges, but tmpfs reads are from RAM anyway + READ_OUT=$(dd if=/dev/shm/bench.tmp of=/dev/null bs=1M count=1024 2>&1) + echo "$READ_OUT" + READ_SPEED=$(echo "$READ_OUT" | grep -oP '[\d.]+ [GM]B/s' | tail -1) + echo "Read speed: ${READ_SPEED:-see above}" + echo "" + + # Cleanup + rm -f /dev/shm/bench.tmp + + echo "=== Shared Memory Test Complete ===" + resource: gpu-resource From 1cf8b919617629b699599946477f8c03ca888962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Tue, 10 Mar 2026 10:18:53 +0100 Subject: [PATCH 35/37] - fix mnt data test --- .../osmo/workflows/osmo/test_mnt_data.yaml | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/applications/osmo/workflows/osmo/test_mnt_data.yaml b/applications/osmo/workflows/osmo/test_mnt_data.yaml index 6f3687cc4..a78e0fbb5 100644 --- a/applications/osmo/workflows/osmo/test_mnt_data.yaml +++ b/applications/osmo/workflows/osmo/test_mnt_data.yaml @@ -41,18 +41,42 @@ workflow: mount | grep /mnt/data || echo "(not visible in mount table -- may be host mount)" echo "" - # Write test - echo "--- Write test (256MB) ---" + # Install fio + apt-get update -qq && apt-get install -y -qq fio > /dev/null 2>&1 + echo "fio version: $(fio --version)" + echo "" + TEST_DIR="/mnt/data/.osmo-bench-$$" mkdir -p "${TEST_DIR}" - WRITE_OUT=$(dd if=/dev/zero of="${TEST_DIR}/bench.tmp" bs=1M count=256 conv=fdatasync 2>&1) - 
echo "$WRITE_OUT" + + # Sequential write test + echo "--- Sequential Write (direct I/O, 1M blocks, 8 jobs) ---" + fio --name=seq-write \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=write --bs=1M --iodepth=32 \ + --thread --numjobs=8 --size=2G --runtime=30 \ + --group_reporting + echo "" + + # Sequential read test + echo "--- Sequential Read (direct I/O, 1M blocks, 8 jobs) ---" + fio --name=seq-read \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=read --bs=1M --iodepth=32 \ + --thread --numjobs=8 --size=2G --runtime=30 \ + --group_reporting echo "" - # Read test - echo "--- Read test (256MB) ---" - READ_OUT=$(dd if="${TEST_DIR}/bench.tmp" of=/dev/null bs=1M 2>&1) - echo "$READ_OUT" + # Random read/write (4K) for IOPS + echo "--- Random Read/Write 4K (IOPS test, 4 jobs) ---" + fio --name=rand-rw \ + --ioengine=libaio --direct=1 --time_based \ + --directory="${TEST_DIR}" \ + --rw=randrw --rwmixread=70 --bs=4k --iodepth=32 \ + --thread --numjobs=4 --size=1G --runtime=30 \ + --group_reporting echo "" # Cleanup From 8cb132013e8a5973ad0a977dd964b2364e6896e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Mar 2026 11:10:47 +0100 Subject: [PATCH 36/37] - move mystery box stuff to terraform --- .../000-prerequisites/nebius-env-init.sh | 9 +- ...ets-init.sh => secrets-init_deprecated.sh} | 0 .../osmo/deploy/example/001-iac/main.tf | 4 - .../example/001-iac/modules/platform/main.tf | 121 ++++++++++++------ .../001-iac/modules/platform/outputs.tf | 8 +- .../001-iac/modules/platform/variables.tf | 37 ------ .../osmo/deploy/example/001-iac/variables.tf | 36 ------ 7 files changed, 87 insertions(+), 128 deletions(-) rename applications/osmo/deploy/example/000-prerequisites/{secrets-init.sh => secrets-init_deprecated.sh} (100%) diff --git a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh 
b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh index 850dd7e18..cd217a6d7 100755 --- a/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh +++ b/applications/osmo/deploy/example/000-prerequisites/nebius-env-init.sh @@ -258,14 +258,13 @@ main() { echo " Subnet: $subnet_name ($subnet_id)" echo "" echo "Next steps:" - echo " 1. source ./secrets-init.sh # Initialize MysteryBox secrets (recommended)" - echo " 2. cd ../001-iac" - echo " 3. cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars" - echo " 4. terraform init && terraform apply" + echo " 1. cd ../001-iac" + echo " 2. cp terraform.tfvars.cost-optimized-secure.example terraform.tfvars" + echo " 3. terraform init && terraform apply" echo "" return 0 } # Run main function -main \ No newline at end of file +main diff --git a/applications/osmo/deploy/example/000-prerequisites/secrets-init.sh b/applications/osmo/deploy/example/000-prerequisites/secrets-init_deprecated.sh similarity index 100% rename from applications/osmo/deploy/example/000-prerequisites/secrets-init.sh rename to applications/osmo/deploy/example/000-prerequisites/secrets-init_deprecated.sh diff --git a/applications/osmo/deploy/example/001-iac/main.tf b/applications/osmo/deploy/example/001-iac/main.tf index 9e066ebec..461c4f28e 100755 --- a/applications/osmo/deploy/example/001-iac/main.tf +++ b/applications/osmo/deploy/example/001-iac/main.tf @@ -41,10 +41,6 @@ module "platform" { # Container Registry enable_container_registry = var.enable_container_registry container_registry_name = var.container_registry_name - - # MysteryBox secrets (optional - more secure, keeps secrets out of TF state) - postgresql_mysterybox_secret_id = var.postgresql_mysterybox_secret_id - mek_mysterybox_secret_id = var.mek_mysterybox_secret_id } # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/main.tf 
b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf index 10c4758bc..38af4f957 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/main.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/main.tf @@ -84,66 +84,103 @@ resource "nebius_compute_v1_filesystem" "shared" { } # ----------------------------------------------------------------------------- -# PostgreSQL Password (from MysteryBox - REQUIRED) +# PostgreSQL Password (generated + stored in MysteryBox) # ----------------------------------------------------------------------------- -# MysteryBox secret ID MUST be provided when using Managed PostgreSQL. -# This ensures passwords are NEVER stored in Terraform state. -# -# Setup: Run 'source ./secrets-init.sh' BEFORE 'terraform apply' -# -# Nebius PostgreSQL password requirements: -# - Min. 8 characters -# - At least one lowercase, uppercase, digit, special char EXCEPT % +# Password is generated by Terraform and stored in MysteryBox using write-only +# fields, so the password is NOT stored in MysteryBox's Terraform state. +# The random_password IS in state (marked sensitive) — acceptable since state +# is stored encrypted in Nebius Object Storage. # ----------------------------------------------------------------------------- -# Validate that MysteryBox secret is provided when PostgreSQL is enabled -resource "terraform_data" "validate_postgresql_secret" { - count = var.enable_managed_postgresql ? 1 : 0 +# Generate PostgreSQL password +resource "random_password" "postgresql" { + count = var.enable_managed_postgresql ? 1 : 0 + length = 32 + special = true + # Nebius PostgreSQL forbids % @ : / ; [ ] { } | < > , . ? + override_special = "!#$^&*()-_=+" - lifecycle { - precondition { - condition = var.postgresql_mysterybox_secret_id != null - error_message = <<-EOT - - ══════════════════════════════════════════════════════════════════════ - ERROR: PostgreSQL MysteryBox secret ID is required! 
- ══════════════════════════════════════════════════════════════════════ - - You must run secrets-init.sh BEFORE terraform apply: - - cd ../000-prerequisites - source ./secrets-init.sh - cd ../001-iac - terraform apply - - This creates the PostgreSQL password in MysteryBox and sets: - TF_VAR_postgresql_mysterybox_secret_id - - Without this, Terraform cannot securely configure PostgreSQL. - ══════════════════════════════════════════════════════════════════════ - EOT + keepers = { + # Password stays stable unless you taint this resource + parent_id = var.parent_id + } +} + +# Store PostgreSQL password in MysteryBox (write-only — not in TF state) +resource "nebius_mysterybox_v1_secret" "postgresql_password" { + count = var.enable_managed_postgresql ? 1 : 0 + parent_id = var.parent_id + name = "${var.name_prefix}-postgresql-password" + + sensitive = { + secret_version = { + payload = [{ + key = "password" + string_value = random_password.postgresql[0].result + }] } + version = random_password.postgresql[0].result } } -# Read password from MysteryBox (ephemeral - NOT stored in state) +# Read password back from MysteryBox (ephemeral - NOT stored in state) ephemeral "nebius_mysterybox_v1_secret_payload_entry" "postgresql_password" { - count = var.enable_managed_postgresql && var.postgresql_mysterybox_secret_id != null ? 1 : 0 - secret_id = var.postgresql_mysterybox_secret_id + count = var.enable_managed_postgresql ? 1 : 0 + secret_id = nebius_mysterybox_v1_secret.postgresql_password[0].id key = "password" } # Local to get the password from MysteryBox locals { postgresql_password = ( - !var.enable_managed_postgresql - ? null # PostgreSQL disabled - : var.postgresql_mysterybox_secret_id != null - ? ephemeral.nebius_mysterybox_v1_secret_payload_entry.postgresql_password[0].data.string_value - : null # Will fail validation above + var.enable_managed_postgresql + ? 
ephemeral.nebius_mysterybox_v1_secret_payload_entry.postgresql_password[0].data.string_value + : null ) } +# ----------------------------------------------------------------------------- +# MEK (Master Encryption Key) — generated + stored in MysteryBox +# ----------------------------------------------------------------------------- + +# Generate MEK +resource "random_bytes" "mek_key" { + length = 32 + + keepers = { + parent_id = var.parent_id + } +} + +locals { + mek_jwk = jsonencode({ + kty = "oct" + k = random_bytes.mek_key.base64 + alg = "A256GCM" + use = "enc" + }) + mek_json = jsonencode({ + currentMek = "key1" + meks = { key1 = base64encode(local.mek_jwk) } + }) +} + +# Store MEK in MysteryBox (write-only) +resource "nebius_mysterybox_v1_secret" "mek" { + parent_id = var.parent_id + name = "${var.name_prefix}-mek" + + sensitive = { + secret_version = { + payload = [{ + key = "mek" + string_value = local.mek_json + }] + } + version = random_bytes.mek_key.base64 + } +} + # ----------------------------------------------------------------------------- # Managed PostgreSQL (MSP) - Nebius Managed Service for PostgreSQL # Enabled by default for production-ready database service diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf index 47225611f..01aea7674 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/outputs.tf @@ -107,13 +107,13 @@ output "postgresql_password" { } output "postgresql_mysterybox_secret_id" { - description = "MysteryBox secret ID for PostgreSQL password (if configured)" - value = var.postgresql_mysterybox_secret_id + description = "MysteryBox secret ID for PostgreSQL password" + value = var.enable_managed_postgresql ? 
nebius_mysterybox_v1_secret.postgresql_password[0].id : null } output "mek_mysterybox_secret_id" { - description = "MysteryBox secret ID for MEK (if configured)" - value = var.mek_mysterybox_secret_id + description = "MysteryBox secret ID for MEK" + value = nebius_mysterybox_v1_secret.mek.id } # ----------------------------------------------------------------------------- diff --git a/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf index 5d12973f1..a832a9d66 100755 --- a/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf +++ b/applications/osmo/deploy/example/001-iac/modules/platform/variables.tf @@ -157,43 +157,6 @@ variable "postgresql_username" { default = "osmo_admin" } -# ----------------------------------------------------------------------------- -# MysteryBox Secret IDs (REQUIRED for Managed PostgreSQL) -# ----------------------------------------------------------------------------- -# MysteryBox secret ID is REQUIRED when using Managed PostgreSQL. -# This ensures passwords are NEVER stored in Terraform state. -# -# REQUIRED setup (before terraform apply): -# 1. cd deploy/000-prerequisites -# 2. source ./secrets-init.sh -# 3. cd ../001-iac && terraform apply -# -# The script sets TF_VAR_postgresql_mysterybox_secret_id automatically. -# If you forget, Terraform will fail with a clear error message. -# ----------------------------------------------------------------------------- - -variable "postgresql_mysterybox_secret_id" { - description = "MysteryBox secret ID for PostgreSQL password (REQUIRED when enable_managed_postgresql=true)" - type = string - default = null - - validation { - condition = var.postgresql_mysterybox_secret_id == null || can(regex("^mbsec-", var.postgresql_mysterybox_secret_id)) - error_message = "PostgreSQL MysteryBox secret ID must start with 'mbsec-'. 
Run: source ./secrets-init.sh" - } -} - -variable "mek_mysterybox_secret_id" { - description = "MysteryBox secret ID for MEK (Master Encryption Key)" - type = string - default = null - - validation { - condition = var.mek_mysterybox_secret_id == null || can(regex("^mbsec-", var.mek_mysterybox_secret_id)) - error_message = "MEK MysteryBox secret ID must start with 'mbsec-'. Run: source ./secrets-init.sh" - } -} - # ----------------------------------------------------------------------------- # Container Registry Configuration # Reference: https://docs.nebius.com/terraform-provider/reference/resources/registry_v1_registry diff --git a/applications/osmo/deploy/example/001-iac/variables.tf b/applications/osmo/deploy/example/001-iac/variables.tf index 3557341fb..cd251d6cb 100755 --- a/applications/osmo/deploy/example/001-iac/variables.tf +++ b/applications/osmo/deploy/example/001-iac/variables.tf @@ -366,42 +366,6 @@ variable "container_registry_name" { default = "" } -# ============================================================================= -# MysteryBox Secrets Configuration (REQUIRED for Managed PostgreSQL) -# ============================================================================= -# These variables MUST be set when using Managed PostgreSQL. -# Secrets are stored in MysteryBox, keeping them OUT of Terraform state. -# -# REQUIRED Setup (before terraform apply): -# 1. cd deploy/000-prerequisites -# 2. source ./secrets-init.sh -# 3. This sets TF_VAR_postgresql_mysterybox_secret_id automatically -# -# If you see validation errors, you forgot to run secrets-init.sh! 
-# ============================================================================= - -variable "postgresql_mysterybox_secret_id" { - description = "MysteryBox secret ID for PostgreSQL password (REQUIRED - set by secrets-init.sh)" - type = string - default = null - - validation { - condition = var.postgresql_mysterybox_secret_id == null || can(regex("^mbsec-", var.postgresql_mysterybox_secret_id)) - error_message = "PostgreSQL MysteryBox secret ID must start with 'mbsec-' (e.g., mbsec-e00xxx). Run: source ./secrets-init.sh" - } -} - -variable "mek_mysterybox_secret_id" { - description = "MysteryBox secret ID for OSMO MEK (Master Encryption Key)" - type = string - default = null - - validation { - condition = var.mek_mysterybox_secret_id == null || can(regex("^mbsec-", var.mek_mysterybox_secret_id)) - error_message = "MEK MysteryBox secret ID must start with 'mbsec-' (e.g., mbsec-e00xxx). Run: source ./secrets-init.sh" - } -} - # ============================================================================= # WireGuard VPN Configuration # ============================================================================= From 26fa51c253b6210377822aa58a7d28acc2fddd8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rene=20Sch=C3=B6nfelder?= Date: Wed, 11 Mar 2026 11:51:57 +0100 Subject: [PATCH 37/37] - fix deployment syntax for zsh as well as double check kubectl config --- .../002-setup/01-deploy-gpu-infrastructure.sh | 7 +++--- .../deploy/example/002-setup/lib/common.sh | 24 +++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh index e6a8b0578..b257ddb40 100755 --- a/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh +++ b/applications/osmo/deploy/example/002-setup/01-deploy-gpu-infrastructure.sh @@ -56,16 +56,17 @@ else kubectl create namespace "${GPU_OPERATOR_NAMESPACE}" 
--dry-run=client -o yaml | kubectl apply -f - - DRIVER_VERSION_ARGS=() + DRIVER_VERSION_SET=() if [[ -n "${GPU_DRIVER_VERSION:-}" ]]; then log_info "Using pinned driver version: ${GPU_DRIVER_VERSION}" - DRIVER_VERSION_ARGS=(--set "driver.version=${GPU_DRIVER_VERSION}") + DRIVER_VERSION_SET=(--set "driver.version=${GPU_DRIVER_VERSION}") fi + # shellcheck disable=SC2086 helm upgrade --install gpu-operator nvidia/gpu-operator \ --namespace "${GPU_OPERATOR_NAMESPACE}" \ --values "${VALUES_DIR}/gpu-operator.yaml" \ - "${DRIVER_VERSION_ARGS[@]+"${DRIVER_VERSION_ARGS[@]}"}" \ + ${DRIVER_VERSION_SET[@]:+${DRIVER_VERSION_SET[@]}} \ --timeout 10m log_success "GPU Operator deployed (pods will become ready when GPU nodes are available)" diff --git a/applications/osmo/deploy/example/002-setup/lib/common.sh b/applications/osmo/deploy/example/002-setup/lib/common.sh index 87abfb573..17cc53533 100755 --- a/applications/osmo/deploy/example/002-setup/lib/common.sh +++ b/applications/osmo/deploy/example/002-setup/lib/common.sh @@ -138,19 +138,35 @@ wait_for_condition() { return 1 } -# Check kubectl connection +# Check kubectl connection and verify we're targeting the correct cluster check_kubectl() { if ! check_command kubectl; then log_error "kubectl not found" return 1 fi - + if ! kubectl cluster-info &>/dev/null; then log_error "Cannot connect to Kubernetes cluster" return 1 fi - - log_success "kubectl connected to cluster" + + # Verify current context matches the expected cluster from Terraform + local expected_cluster + expected_cluster=$(get_tf_output "cluster_name" "../001-iac" 2>/dev/null || true) + if [[ -n "$expected_cluster" ]]; then + local current_context + current_context=$(kubectl config current-context 2>/dev/null || true) + if [[ -n "$current_context" && "$current_context" != *"$expected_cluster"* ]]; then + log_error "Wrong Kubernetes context!" 
+ log_error " Current context: $current_context" + log_error " Expected cluster: $expected_cluster" + log_info "Switch context with: nebius mk8s cluster get-credentials --id \$(terraform -chdir=../001-iac output -raw cluster_id) --external" + return 1 + fi + log_success "kubectl connected to cluster ($expected_cluster)" + else + log_success "kubectl connected to cluster" + fi return 0 }