Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 0 additions & 42 deletions soperator/installations/example/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ NEBIUS_TENANT_ID="$NEBIUS_TENANT_ID" # ='tenant-...'
NEBIUS_PROJECT_ID="$NEBIUS_PROJECT_ID" # ='project-...'
NEBIUS_REGION="${NEBIUS_REGION:-eu-north1}"

# O11y setup.
NEBIUS_OLLY_PROFILE="${NEBIUS_OLLY_PROFILE:-soperator-telemetry}"
NEBIUS_OLLY_TENANT_ID="${NEBIUS_OLLY_TENANT_ID:-tenant-e00vyb5y1x5vqkzw5e}" # ='tenant-...'

if [ -z "${NEBIUS_TENANT_ID}" ]; then
echo "Error: NEBIUS_TENANT_ID is not set"
return 1
Expand All @@ -16,37 +12,6 @@ if [ -z "${NEBIUS_PROJECT_ID}" ]; then
return 1
fi

# region IAM token

unset NEBIUS_IAM_TOKEN
nebius iam whoami > /dev/null
nebius iam get-access-token > /dev/null
NEBIUS_IAM_TOKEN=$(nebius iam get-access-token)
export NEBIUS_IAM_TOKEN

if [ -f "$HOME/.nebius/credentials.yaml" ]; then
IAM_TOKEN_EXPIRES_AT=$(yq '.tokens[].expires_at' "$HOME/.nebius/credentials.yaml" 2>/dev/null)
if [ -n "$IAM_TOKEN_EXPIRES_AT" ]; then
if [[ "$(uname)" == "Darwin" ]]; then
echo "IAM token expires at: $(date -r "$IAM_TOKEN_EXPIRES_AT")"
else
echo "IAM token expires at: $(date -d @"$IAM_TOKEN_EXPIRES_AT")"
fi
fi
fi

# endregion IAM token

# region VPC subnet

NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \
--parent-id "${NEBIUS_PROJECT_ID}" \
--format json \
| jq -r '.items[0].metadata.id')
export NEBIUS_VPC_SUBNET_ID

# endregion VPC subnet

# region Remote state

# region Service account
Expand Down Expand Up @@ -209,12 +174,8 @@ EOF
# region TF variables

export TF_VAR_region="${NEBIUS_REGION}"
export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}"
export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}"
export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}"
export TF_VAR_o11y_iam_tenant_id="${NEBIUS_OLLY_TENANT_ID}"
export TF_VAR_o11y_profile="${NEBIUS_OLLY_PROFILE}"
export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}"
export TF_VAR_aws_access_key_id="${AWS_ACCESS_KEY_ID}"
export TF_VAR_aws_secret_access_key="${AWS_SECRET_ACCESS_KEY}"
export TFE_PARALLELISM=20
Expand All @@ -223,9 +184,6 @@ echo "Exported variables:"
echo "TF_VAR_region: ${TF_VAR_region}"
echo "TF_VAR_iam_tenant_id: ${TF_VAR_iam_tenant_id}"
echo "TF_VAR_iam_project_id: ${TF_VAR_iam_project_id}"
echo "TF_VAR_o11y_iam_tenant_id: ${TF_VAR_o11y_iam_tenant_id}"
echo "TF_VAR_o11y_profile: ${TF_VAR_o11y_profile}"
echo "TF_VAR_vpc_subnet_id: ${TF_VAR_vpc_subnet_id}"
echo "TF_VAR_aws_access_key_id: ${TF_VAR_aws_access_key_id}"
echo "TFE_PARALLELISM: ${TFE_PARALLELISM}"

Expand Down
20 changes: 10 additions & 10 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ module "nfs-server" {
module "cleanup" {
source = "../../modules/cleanup"

iam_project_id = var.iam_project_id
iam_project_id = local.iam_project_id
}

module "k8s_cleanup" {
Expand Down Expand Up @@ -291,9 +291,9 @@ module "o11y" {

source = "../../modules/o11y"

iam_project_id = var.iam_project_id
o11y_iam_tenant_id = var.o11y_iam_tenant_id
o11y_profile = var.o11y_profile
iam_project_id = local.iam_project_id
o11y_iam_tenant_id = local.o11y_iam_tenant_id
o11y_profile = local.o11y_profile
k8s_cluster_context = module.k8s.cluster_context
company_name = var.company_name
}
Expand All @@ -310,9 +310,9 @@ module "slurm" {

active_checks_scope = var.active_checks_scope

region = var.region
iam_tenant_id = var.iam_tenant_id
iam_project_id = var.iam_project_id
region = local.region
iam_tenant_id = data.nebius_iam_v1_tenant.this.id
iam_project_id = data.nebius_iam_v1_project.this.id
cluster_name = var.company_name
name = local.slurm_cluster_name
k8s_cluster_context = module.k8s.cluster_context
Expand Down Expand Up @@ -525,7 +525,7 @@ module "backups_store" {

source = "../../modules/backups_store"

iam_project_id = var.iam_project_id
iam_project_id = local.iam_project_id
instance_name = local.k8s_cluster_name

cleanup_bucket_on_destroy = var.cleanup_bucket_on_destroy
Expand All @@ -544,8 +544,8 @@ module "backups" {
k8s_cluster_context = module.k8s.cluster_context
k8s_cluster_id = module.k8s.cluster_id

iam_project_id = var.iam_project_id
iam_tenant_id = var.iam_tenant_id
iam_project_id = data.nebius_iam_v1_project.this.id
iam_tenant_id = data.nebius_iam_v1_tenant.this.id
instance_name = local.k8s_cluster_name
soperator_namespace = local.slurm_cluster_name
backups_password = var.backups_password
Expand Down
47 changes: 43 additions & 4 deletions soperator/installations/example/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,17 @@ terraform {
source = "hashicorp/helm"
version = "<3.0.0"
}

external = {
source = "hashicorp/external"
version = ">= 2.3.0"
}
}
}

provider "nebius" {
domain = "api.eu.nebius.cloud:443"
domain = "api.eu.nebius.cloud:443"
profile = {}
}

provider "units" {}
Expand All @@ -44,22 +50,55 @@ provider "string-functions" {}
provider "kubernetes" {
host = module.k8s.control_plane.public_endpoint
cluster_ca_certificate = module.k8s.control_plane.cluster_ca_certificate
token = var.iam_token
exec {
api_version = "client.authentication.k8s.io/v1beta1"
command = "nebius"
args = [
"mk8s",
"v1",
"cluster",
"get-token",
"--format",
"json",
]
}
}

provider "flux" {
kubernetes = {
host = module.k8s.control_plane.public_endpoint
cluster_ca_certificate = module.k8s.control_plane.cluster_ca_certificate
token = var.iam_token
exec = {
api_version = "client.authentication.k8s.io/v1beta1"
command = "nebius"
args = [
"mk8s",
"v1",
"cluster",
"get-token",
"--format",
"json",
]
}
}
}

provider "helm" {
kubernetes {
host = module.k8s.control_plane.public_endpoint
cluster_ca_certificate = module.k8s.control_plane.cluster_ca_certificate
token = var.iam_token
exec {
api_version = "client.authentication.k8s.io/v1beta1"
command = "nebius"
args = [
"mk8s",
"v1",
"cluster",
"get-token",
"--format",
"json",
]
}
}
}

Expand Down
74 changes: 38 additions & 36 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
@@ -1,73 +1,65 @@
# region Cloud

data "external" "env" {
program = ["jq", "--null-input", "env | { NEBIUS_PROJECT_ID, NEBIUS_OLLY_PROFILE, NEBIUS_OLLY_TENANT_ID }"]
}
locals {
region = coalesce(var.region, data.nebius_iam_v1_project.this.region)
iam_project_id = coalesce(var.iam_project_id, data.external.env.result.NEBIUS_PROJECT_ID)
o11y_profile = coalesce(var.o11y_profile, data.external.env.result.NEBIUS_OLLY_PROFILE, "soperator-telemetry")
o11y_iam_tenant_id = coalesce(var.o11y_iam_tenant_id, data.external.env.result.NEBIUS_OLLY_TENANT_ID, "tenant-e00vyb5y1x5vqkzw5e")
}

variable "region" {
description = "Region of the project."
type = string
nullable = false
default = null
}
resource "terraform_data" "check_region" {
lifecycle {
precondition {
condition = contains(module.resources.regions, var.region)
error_message = "Unknown region '${var.region}'. See https://docs.nebius.com/overview/regions"
condition = var.region == null || contains(module.resources.regions, var.region)
error_message = "Unknown region '${var.region != null ? var.region : ""}'. See https://docs.nebius.com/overview/regions"
}
}
}

variable "iam_token" {
description = "IAM token used for communicating with Nebius services."
type = string
nullable = false
sensitive = true
}

variable "iam_project_id" {
description = "ID of the IAM project."
type = string
nullable = false
default = null

validation {
condition = startswith(var.iam_project_id, "project-")
condition = var.iam_project_id == null || startswith(var.iam_project_id, "project-")
error_message = "ID of the IAM project must start with `project-`."
}
}
data "nebius_iam_v1_project" "this" {
id = var.iam_project_id
}

variable "iam_tenant_id" {
description = "ID of the IAM tenant."
type = string
nullable = false

validation {
condition = startswith(var.iam_tenant_id, "tenant-")
error_message = "ID of the IAM tenant must start with `tenant-`."
}
id = local.iam_project_id
}

data "nebius_iam_v1_tenant" "this" {
id = var.iam_tenant_id
id = data.nebius_iam_v1_project.this.parent_id
}

variable "o11y_iam_tenant_id" {
description = "ID of the IAM tenant for O11y."
type = string
nullable = false
default = null

validation {
condition = startswith(var.o11y_iam_tenant_id, "tenant-")
condition = var.o11y_iam_tenant_id == null || startswith(var.o11y_iam_tenant_id, "tenant-")
error_message = "ID of the IAM tenant must start with `tenant-`."
}
}

variable "o11y_profile" {
description = "Profile for nebius CLI for public o11y."
type = string
nullable = false
default = null

validation {
condition = (
condition = var.o11y_profile == null || (
(length(var.o11y_profile) >= 1 && var.public_o11y_enabled) ||
!var.public_o11y_enabled
)
Expand Down Expand Up @@ -97,14 +89,24 @@ If you provision a NON-PRODUCTION cluster, set "production" variable to false.
variable "vpc_subnet_id" {
description = "ID of VPC subnet."
type = string
nullable = true
default = null

validation {
condition = startswith(var.vpc_subnet_id, "vpcsubnet-")
condition = var.vpc_subnet_id == null || startswith(var.vpc_subnet_id, "vpcsubnet-")
error_message = "The ID of the VPC subnet must start with `vpcsubnet-`."
}
}

data "external" "default_vpc_subnet" {
program = ["bash", "-euo", "pipefail", "-c", <<-BASH
nebius vpc subnet list --parent-id "$0" --format json | jq -r '.items[0].metadata | { id }'
BASH
, data.nebius_iam_v1_project.this.id]
}

data "nebius_vpc_v1_subnet" "this" {
id = var.vpc_subnet_id
id = var.vpc_subnet_id != null ? var.vpc_subnet_id : data.external.default_vpc_subnet.result.id
}

variable "slurm_login_public_ip" {
Expand Down Expand Up @@ -420,10 +422,10 @@ resource "terraform_data" "check_nfs" {

precondition {
condition = (var.nfs.enabled
? contains(module.resources.platform_regions[var.nfs.resource.platform], var.region)
? contains(module.resources.platform_regions[var.nfs.resource.platform], local.region)
: true
)
error_message = "Unsupported platform '${var.nfs.resource.platform}' in region '${var.region}'. See https://docs.nebius.com/compute/virtual-machines/types"
error_message = "Unsupported platform '${var.nfs.resource.platform}' in region '${local.region}'. See https://docs.nebius.com/compute/virtual-machines/types"
}
}
}
Expand Down Expand Up @@ -938,8 +940,8 @@ resource "terraform_data" "check_slurm_nodeset" {
}

precondition {
condition = contains(module.resources.platform_regions[each.value.resource.platform], var.region)
error_message = "Unsupported platform '${each.value.resource.platform}' in region '${var.region}'. See https://docs.nebius.com/compute/virtual-machines/types"
condition = contains(module.resources.platform_regions[each.value.resource.platform], local.region)
error_message = "Unsupported platform '${each.value.resource.platform}' in region '${local.region}'. See https://docs.nebius.com/compute/virtual-machines/types"
}

# TODO: precondition for total node group count
Expand All @@ -957,7 +959,7 @@ resource "terraform_data" "check_local_nvme" {
alltrue([
for worker in var.slurm_nodeset_workers :
!try(worker.local_nvme.enabled, false) || (
try(module.resources.local_nvme_supported_by_region_platform_preset[var.region][worker.resource.platform][worker.resource.preset], false)
try(module.resources.local_nvme_supported_by_region_platform_preset[local.region][worker.resource.platform][worker.resource.preset], false)
)
])
)
Expand Down
Loading