Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
6dba685
Add support for multiple ssh keys in Soperator NFS module
alex000kim Jan 6, 2025
f789f7f
change mount path to home
alex000kim Jan 7, 2025
0670b36
Update soperator/installations/example/main.tf
alex000kim Jan 8, 2025
48fc1be
MSP-3514: fix collect logs from workers
Uburro Jan 8, 2025
bdee499
Merge branch 'release/soperator' into feature/soperator-nfs-ssh-access
alex000kim Jan 9, 2025
3bb2bac
Merge branch 'release/soperator' into MSP-3514
Uburro Jan 9, 2025
6bc29b1
Merge pull request #148 from nebius/MSP-3514
Uburro Jan 9, 2025
2ef8523
Merge pull request #144 from nebius/feature/soperator-nfs-ssh-access
alex000kim Jan 9, 2025
9ed2a82
add maintenance and use_default_apparmor_profile
Uburro Jan 10, 2025
b83df17
Merge pull request #152 from nebius/overwrite-jail
Uburro Jan 10, 2025
5ba5ca5
bump-prom-0.79.2
Uburro Jan 10, 2025
c0d7954
Bump cert-manager v1.16.2
Uburro Jan 10, 2025
b8ccbf1
Merge pull request #153 from nebius/bump-prom-0.79.2
Uburro Jan 10, 2025
0507716
[ADD] Editorconfig
dstaroff Jan 4, 2025
3e66639
[ENH] Get rid of legacy ENV usage
dstaroff Jan 4, 2025
7f7ecd1
[ENH] Allow skipping data downloading
dstaroff Jan 4, 2025
c206de6
[ENH] Put logs into result dir
dstaroff Jan 4, 2025
4deaa9c
[ADD] Support for MLFlow logging
dstaroff Jan 4, 2025
a5a3dc6
[ADD] Create results directory
dstaroff Jan 6, 2025
be537d9
[ENH] Get rid of explicit log dir
dstaroff Jan 6, 2025
b88d5cd
[ENH] Justify MLFlow tags
dstaroff Jan 6, 2025
7f75674
[ENH] Handle `cfg` not present in `params`
dstaroff Jan 6, 2025
6caeadf
[ADD] Implement custom block metrics for benchmark
dstaroff Jan 7, 2025
18664d0
[ADD] Example MLFlow env var setting script
dstaroff Jan 9, 2025
531fbf2
[ENH] Use constant seed
dstaroff Jan 10, 2025
5d20f71
[ENH] Don't wait for metrics to be sent
dstaroff Jan 10, 2025
2709925
[ADD] Introduce `samples_per_training_step` param
dstaroff Jan 10, 2025
571506c
[ADD] Introduce `timeToRun` metric
dstaroff Jan 10, 2025
df3018b
[ENH] Justify params, metrics and keys
dstaroff Jan 10, 2025
1115f53
[ADD] Support for exporting metrics from MetricsLogger
dstaroff Jan 10, 2025
e0d7216
[ADD] Implement extra metrics
dstaroff Jan 10, 2025
4aec16b
[ENH] Remove redundant step metric collection from logger + fix calcu…
dstaroff Jan 12, 2025
a063011
[ENH] Remove incorrect H200 config [MSP-3612]
dstaroff Jan 12, 2025
ce7c275
[FIX] Metric names not working during MLFlow's local file store expor…
dstaroff Jan 13, 2025
cffb6f2
[DOC] Support skipping data download jobs on init
dstaroff Jan 13, 2025
2ec819a
[DOC] Provide docs for running GPT3 benchmark with MLFlow
dstaroff Jan 13, 2025
f2c300b
[FIX] Get subnet's CIDR from status rather than from spec [MSP-3950]
dstaroff Jan 13, 2025
dfc61d4
Merge pull request #155 from nebius/dev/soperator-subnet
dstaroff Jan 13, 2025
74ae6bc
Merge pull request #146 from nebius/dev/soperator-mlperf
dstaroff Jan 13, 2025
36e1d5d
MSP-3518: add slurmdbd_config and slurm_config configurable
Uburro Jan 13, 2025
ce33407
Merge pull request #157 from nebius/MSP-3516
Uburro Jan 13, 2025
53f0c45
[ENH] Remove redundant repo downloading for SD and unify it with GPT3…
dstaroff Jan 16, 2025
598df5c
Merge pull request #160 from nebius/dev/soperator-mlperf
dstaroff Jan 16, 2025
6d337c4
added variable for region
Jan 17, 2025
e63002b
Merge pull request #163 from nebius/fix/terraform-state-region
dstaroff Jan 17, 2025
2f8cb47
update image rbac proxy
Uburro Jan 20, 2025
0f1df66
[ENH] Precise calculation of allocatable CPU & RAM
dstaroff Jan 20, 2025
d088fec
[ENH] Take less ephemeral storage for reserve
dstaroff Jan 20, 2025
7c47270
Merge pull request #169 from nebius/update-image-rbac-proxy
Uburro Jan 20, 2025
d160b03
Merge pull request #170 from nebius/dev/soperator-resources
dstaroff Jan 20, 2025
12dd86c
Merge pull request #154 from nebius/cert-manager-v1.16.2
Uburro Jan 21, 2025
9eeb273
[ADD] Introduce `region` variable [MSP-3541]
dstaroff Jan 21, 2025
01d1757
[ADD] Introduce regional support for platforms [MSP-4025]
dstaroff Jan 22, 2025
50835e2
[ADD] Check for regional support for platforms [MSP-4025]
dstaroff Jan 22, 2025
52234a7
[ADD] Check for minimal boot disk size [MSP-3632]
dstaroff Jan 22, 2025
426db6e
Merge pull request #173 from nebius/dev/soperator-region
dstaroff Jan 22, 2025
d120296
[REM] Get rid of NodePort support [MSP-4043]
dstaroff Jan 23, 2025
34b5b53
[ENH] Add tflint configs
dstaroff Jan 23, 2025
a27e710
[ADD] Provide static IP for login service [MSP-3801]
dstaroff Jan 23, 2025
a08dbb1
[FIX] Set real worker memory size as default real memory size for nod…
dstaroff Jan 24, 2025
e4d8a26
Merge pull request #176 from nebius/dev/soperator-static-ip
dstaroff Jan 24, 2025
16f6b86
MSP-4096: fix bugs with dependency mariadb-operator from monitoring s…
Uburro Jan 28, 2025
fa9e2ee
Merge pull request #178 from nebius/MSP-4096
Uburro Jan 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion modules/nfs-server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ module nfs-module {
parent_id = var.parent_id
subnet_id = var.subnet_id
ssh_user_name = var.ssh_user_name
ssh_public_key = var.ssh_public_key.key
ssh_public_keys = var.ssh_public_keys
nfs_ip_range = var.nfs_ip_range
nfs_size = var.nfs_size
}
Expand Down
4 changes: 3 additions & 1 deletion modules/nfs-server/files/nfs-cloud-init.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ users:
shell: /bin/bash
sudo: 'ALL=(ALL) NOPASSWD:ALL'
ssh-authorized-keys:
- ${ssh_public_key}
%{ for key in ssh_public_keys ~}
- ${key}
%{ endfor ~}

runcmd:
- apt-get update
Expand Down
2 changes: 1 addition & 1 deletion modules/nfs-server/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ resource "nebius_compute_v1_instance" "nfs_server" {

cloud_init_user_data = templatefile("${path.module}/files/nfs-cloud-init.tftpl", {
ssh_user_name = var.ssh_user_name,
ssh_public_key = var.ssh_public_key,
ssh_public_keys = var.ssh_public_keys,
nfs_ip_range = var.nfs_ip_range,
nfs_path = var.nfs_path,
mtu_size = var.mtu_size
Expand Down
7 changes: 4 additions & 3 deletions modules/nfs-server/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ variable "preset" {
}

# SSH KEY
variable "ssh_public_key" {
description = "SSH public key for the 'root' user."
type = string
variable "ssh_public_keys" {
type = list(string)
description = "List of SSH public keys allowed to access the NFS server."
default = []
}

variable "instance_name" {
Expand Down
12 changes: 6 additions & 6 deletions nfs-server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ source ./env.sh

## Usage

To use this module in your Terraform environment, you must first create a Terraform configuration, such as the file `nfs.tfvars`, with the following example content:
To use this module in your Terraform environment, you must first create a Terraform configuration, such as the file `terraform.tfvars`, with the following example content:

```hcl
parent_id = ""
subnet_id = ""
ssh_user_name = "nfs"
ssh_public_key = {
key = "put your public ssh key here"
path = "put path to ssh key here"
}
ssh_public_keys = [
"ssh-rsa AAAA...", # First user's public key
"ssh-rsa AAAA..." # Second user's public key
]
nfs_ip_range = "192.168.0.0/16"
```

Expand All @@ -31,7 +31,7 @@ run terraform:
```
terraform init
terraform plan
terraform apply -var-file nfs.tfvars
terraform apply
```

Once you have done that, you can mount the NFS share on your target device using the following command:
Expand Down
4 changes: 1 addition & 3 deletions nfs-server/locals.tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
locals {
ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : (
fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null)

regions_default = {
eu-west1 = {
cpu_nodes_platform = "cpu-d3"
Expand All @@ -17,4 +14,5 @@ locals {

cpu_nodes_preset = coalesce(var.cpu_nodes_preset, local.current_region_defaults.cpu_nodes_preset)
cpu_nodes_platform = coalesce(var.cpu_nodes_platform, local.current_region_defaults.cpu_nodes_platform)
ssh_public_keys = var.ssh_public_keys
}
18 changes: 9 additions & 9 deletions nfs-server/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ module "nfs-module" {
providers = {
nebius = nebius
}
source = "../modules/nfs-server"
parent_id = var.parent_id
subnet_id = var.subnet_id
ssh_user_name = var.ssh_user_name
ssh_public_key = var.ssh_public_key.key
nfs_ip_range = var.nfs_ip_range
nfs_size = var.nfs_size
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
source = "../modules/nfs-server"
parent_id = var.parent_id
subnet_id = var.subnet_id
ssh_user_name = var.ssh_user_name
ssh_public_keys = var.ssh_public_keys
nfs_ip_range = var.nfs_ip_range
nfs_size = var.nfs_size
platform = local.cpu_nodes_platform
preset = local.cpu_nodes_preset
}
8 changes: 4 additions & 4 deletions nfs-server/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id
# region = "" # Project region
# ssh_user_name = "" # Username you want to use to connect to the nodes
# ssh_public_key = {
# key = "put your public ssh key here" OR
# path = "put path to ssh key here"
# }
# ssh_public_keys = [
# "ssh-rsa AAAA...", # First user's public key
# "ssh-rsa AAAA..." # Second user's public key
# ]
nfs_ip_range = "192.168.0.0/16"
15 changes: 4 additions & 11 deletions nfs-server/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,10 @@ variable "nfs_size" {
description = "Size of the NFS in GB, should be divisible by 93"
}

variable "ssh_public_key" {
description = "SSH Public Key to access the cluster nodes"
type = object({
key = optional(string),
path = optional(string, "~/.ssh/id_rsa.pub")
})
default = {}
validation {
condition = var.ssh_public_key.key != null || fileexists(var.ssh_public_key.path)
error_message = "SSH Public Key must be set by `key` or file `path` ${var.ssh_public_key.path}"
}
variable "ssh_public_keys" {
description = "List of SSH Public Keys to access the NFS server"
type = list(string)
default = []
}

variable "ssh_user_name" {
Expand Down
3 changes: 2 additions & 1 deletion soperator/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.terraform*
installations/alexkim
installations/alexkim
installations/pikachu
5 changes: 2 additions & 3 deletions soperator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ kubectl get pods --all-namespaces

Get the Slurm cluster IP address
```bash
export SLURM_IP=$(terraform state show module.login_script.terraform_data.connection_ip | grep 'input' | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1
)
ssh root@$SLURM_IP -i ~/.ssh/<private_id_rsa_key> -p <node_port_if_not_default>
export SLURM_IP=$(terraform state show module.login_script.terraform_data.connection_ip | grep 'input' | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1)
ssh root@$SLURM_IP -i ~/.ssh/<private_id_rsa_key>
```

or connect using the login script:
Expand Down
2 changes: 1 addition & 1 deletion soperator/SUBVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1
2
13 changes: 8 additions & 5 deletions soperator/installations/example/.envrc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
NEBIUS_TENANT_ID='tenant-...'
NEBIUS_PROJECT_ID='project-...'
NEBIUS_REGION='eu-north1'

if [ -z "${NEBIUS_TENANT_ID}" ]; then
echo "Error: NEBIUS_TENANT_ID is not set"
Expand Down Expand Up @@ -33,16 +34,18 @@ export NEBIUS_VPC_SUBNET_ID

# region TF variables

export TF_VAR_region="${NEBIUS_REGION}"
export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}"
export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}"
export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}"
export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}"
export TFE_PARALLELISM=20

echo "Exported variables:"
echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}"
echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}"
echo "NEBIUS_VPC_SUBNET_ID: ${NEBIUS_VPC_SUBNET_ID}"
echo "TF_VAR_region: ${TF_VAR_region}"
echo "TF_VAR_iam_tenant_id: ${TF_VAR_iam_tenant_id}"
echo "TF_VAR_iam_project_id: ${TF_VAR_iam_project_id}"
echo "TF_VAR_vpc_subnet_id: ${TF_VAR_vpc_subnet_id}"
echo "TFE_PARALLELISM: ${TFE_PARALLELISM}"

# endregion TF variables
Expand Down Expand Up @@ -168,9 +171,9 @@ terraform {
key = "slurm-k8s.tfstate"

endpoints = {
s3 = "https://storage.eu-north1.nebius.cloud:443"
s3 = "https://storage.${NEBIUS_REGION}.nebius.cloud:443"
}
region = "eu-north1"
region = "${NEBIUS_REGION}"

skip_region_validation = true
skip_credentials_validation = true
Expand Down
6 changes: 6 additions & 0 deletions soperator/installations/example/.tflint.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
rule "terraform_required_providers" {
enabled = true

source = true
version = false
}
Loading