Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# These owners will be the default owners for everything in
# the repo, unless a later match takes precedence.
* @ValentaTomas @jakubno @dobrac

# AWS infrastructure.
# A trailing "/" matches the directory and everything below it, whereas
# "dir/*" only matches files directly inside the directory (not nested ones).
/iac/provider-aws/ @sitole
22 changes: 8 additions & 14 deletions iac/provider-aws/init/secrets.tf
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,10 @@ resource "aws_secretsmanager_secret" "grafana" {
resource "aws_secretsmanager_secret_version" "grafana" {
secret_id = aws_secretsmanager_secret.grafana.id
secret_string = jsonencode({
"API_KEY" = " ",
"OTLP_URL" = " ",
"OTEL_COLLECTOR_TOKEN" = " ",
"USERNAME" = " ",
"LOGS_USER" = " ",
"LOGS_URL" = " ",
"LOGS_COLLECTOR_API_TOKEN" = " ",
"API_KEY" = " ",
"OTLP_URL" = " ",
"OTEL_COLLECTOR_TOKEN" = " ",
"USERNAME" = " ",
})

lifecycle {
Expand All @@ -82,13 +79,10 @@ locals {

output "grafana" {
value = {
api_key = local.grafana_raw["API_KEY"]
otlp_url = local.grafana_raw["OTLP_URL"]
otel_collector_token = local.grafana_raw["OTEL_COLLECTOR_TOKEN"]
username = local.grafana_raw["USERNAME"]
logs_user = local.grafana_raw["LOGS_USER"]
logs_url = local.grafana_raw["LOGS_URL"]
logs_collector_api_token = local.grafana_raw["LOGS_COLLECTOR_API_TOKEN"]
api_key = local.grafana_raw["API_KEY"]
otlp_url = local.grafana_raw["OTLP_URL"]
otel_collector_token = local.grafana_raw["OTEL_COLLECTOR_TOKEN"]
username = local.grafana_raw["USERNAME"]
}
sensitive = true
}
Expand Down
4 changes: 0 additions & 4 deletions iac/provider-aws/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,6 @@ module "nomad" {
grafana_otlp_url = module.init.grafana.otlp_url
grafana_username = module.init.grafana.username

grafana_logs_user = module.init.grafana.logs_user
grafana_logs_endpoint = module.init.grafana.logs_url
grafana_logs_api_key = module.init.grafana.logs_collector_api_token

api_node_pool = local.api_pool_name
clickhouse_node_pool = local.clickhouse_pool_name
clickhouse_jobs_prefix = local.clickhouse_jobs_prefix
Expand Down
3 changes: 3 additions & 0 deletions iac/provider-aws/modules/nodepool-client/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ locals {

RUN_CONSUL_FILE_HASH = var.setup_files_hash["run-consul"]
RUN_NOMAD_FILE_HASH = var.setup_files_hash["run-nomad"]

SET_ORCHESTRATOR_VERSION_METADATA = var.set_orchestrator_version_metadata ? "true" : "false"
NOMAD_TOKEN = var.nomad_acl_token_secret
})
}

Expand Down
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need the scripts separate?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are fairly different. I want to refactor and unify them in the future. This is just a quick fix.

Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,45 @@ done
echo "- Flushing DNS caches"
resolvectl flush-caches

%{ if SET_ORCHESTRATOR_VERSION_METADATA == "true" }
# Fetch the orchestrator version from the Nomad variable "nomad/jobs" over the
# Nomad HTTP API before the Nomad client is started. This is required: the node
# must not start without knowing the orchestrator version.
#
# The timeout is nominal: it is enforced by counting attempts, so time spent
# inside dig/curl (up to 10s per request) is not part of the reported elapsed
# time.
FETCH_TIMEOUT_SECONDS=600
FETCH_INTERVAL_SECONDS=5
FETCH_MAX_ATTEMPTS=$((FETCH_TIMEOUT_SECONDS / FETCH_INTERVAL_SECONDS + 1))

echo "[Fetching orchestrator version from Nomad servers (timeout: $${FETCH_TIMEOUT_SECONDS}s)]"
ORCHESTRATOR_VERSION=""
FETCH_ATTEMPT=1
while [ "$FETCH_ATTEMPT" -le "$FETCH_MAX_ATTEMPTS" ]; do
  ELAPSED=$(((FETCH_ATTEMPT - 1) * FETCH_INTERVAL_SECONDS))

  # "|| true" keeps a strict-mode shell (set -e / pipefail) from aborting the
  # retry loop when the lookup itself fails.
  NOMAD_SERVER=$(dig +short nomad.service.consul | head -1) || true
  if [ -z "$NOMAD_SERVER" ]; then
    echo "- Waiting for Consul DNS (nomad.service.consul)... ($${ELAPSED}s / $${FETCH_TIMEOUT_SECONDS}s)"
  else
    # A failed request must fall through to the retry branches below instead
    # of terminating the script.
    API_RESPONSE=$(curl -s --connect-timeout 5 --max-time 10 -H "X-Nomad-Token: ${NOMAD_TOKEN}" \
      "http://$NOMAD_SERVER:4646/v1/var/nomad/jobs" 2>/dev/null) || true

    # Accept only a non-empty string. With --exit-status jq fails when the
    # filter yields nothing (missing key, null, wrong type or empty string),
    # so an unusable value is retried rather than passed on to Nomad.
    if ORCHESTRATOR_VERSION=$(printf '%s' "$API_RESPONSE" \
      | jq -e -r '.Items.latest_orchestrator_job_id | strings | select(length > 0)' 2>/dev/null); then
      echo "- Fetched orchestrator version: $ORCHESTRATOR_VERSION"
      break
    elif [ -n "$API_RESPONSE" ]; then
      echo "- Invalid response from Nomad API, retrying... ($${ELAPSED}s / $${FETCH_TIMEOUT_SECONDS}s)"
    else
      echo "- No response from Nomad API at $${NOMAD_SERVER}, retrying... ($${ELAPSED}s / $${FETCH_TIMEOUT_SECONDS}s)"
    fi
  fi

  # Give up after the last attempt; nothing below may run without a version.
  if [ "$FETCH_ATTEMPT" -eq "$FETCH_MAX_ATTEMPTS" ]; then
    echo "- ERROR: Could not fetch orchestrator version from Nomad servers after $${FETCH_TIMEOUT_SECONDS}s"
    echo "- The node cannot start without the orchestrator version. Exiting..."
    exit 1
  fi

  sleep "$FETCH_INTERVAL_SECONDS"
  FETCH_ATTEMPT=$((FETCH_ATTEMPT + 1))
done

/opt/nomad/bin/run-nomad.sh --client --consul-token "${CONSUL_TOKEN}" --node-pool "${NODE_POOL}" --node-labels "${NODE_LABELS}" --orchestrator-job-version "$ORCHESTRATOR_VERSION" &
%{ else }
/opt/nomad/bin/run-nomad.sh --client --consul-token "${CONSUL_TOKEN}" --node-pool "${NODE_POOL}" --node-labels "${NODE_LABELS}" &
%{ endif }

# Add alias for ssh-ing to sbx
echo '_sbx_ssh() {
Expand Down
10 changes: 10 additions & 0 deletions iac/provider-aws/modules/nodepool-client/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,13 @@ variable "scripts_path" {
description = "Path to the directory containing startup scripts. Defaults to in-module scripts."
default = ""
}

# Controls whether the node startup script looks up the orchestrator version
# and passes it to the Nomad client as node metadata. The value is handed to
# the script template as the string "true" or "false".
# No default is declared, so every caller of this module must set it.
variable "set_orchestrator_version_metadata" {
description = "Whether to set orchestrator_version node metadata from Nomad variable on startup"
type = bool
}

# Nomad ACL token that the node startup script sends to the Nomad HTTP API
# when it looks up the orchestrator version. Callers that disable the lookup
# pass an empty string.
# Marked sensitive so Terraform redacts the value in plan and apply output.
variable "nomad_acl_token_secret" {
  description = "Nomad ACL token used to fetch orchestrator version from Nomad API"
  type        = string
  sensitive   = true
}
6 changes: 6 additions & 0 deletions iac/provider-aws/nomad-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,9 @@ module "build" {
templates_bucket_arn = data.aws_s3_bucket.templates_bucket.arn
templates_build_cache_bucket_arn = data.aws_s3_bucket.templates_build_cache_bucket.arn
custom_environments_repo_arn = data.aws_ecr_repository.custom_environments.arn

set_orchestrator_version_metadata = false
nomad_acl_token_secret = "" // not needed as we are not setting up orchestrator metadata
}

module "client" {
Expand Down Expand Up @@ -369,4 +372,7 @@ module "client" {
templates_build_cache_bucket_arn = data.aws_s3_bucket.templates_build_cache_bucket.arn
custom_environments_repo_arn = data.aws_ecr_repository.custom_environments.arn

set_orchestrator_version_metadata = true
nomad_acl_token_secret = var.nomad_acl_token_secret

}
11 changes: 9 additions & 2 deletions iac/provider-aws/nomad-cluster/scripts/run-nomad.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ function print_usage {
echo -e " --skip-nomad-config\tIf this flag is set, don't generate a Nomad configuration file. Optional. Default is false."
echo -e " --api\t\tIf set, run the Nomad agent dedicated to API. Optional. Default is false."
echo -e " --node-labels\t\tComma-separated list of scheduling labels for this node. Optional."
echo -e " --orchestrator-job-version\tThe orchestrator job version to set as node metadata. Optional."
echo
echo "Example:"
echo
Expand Down Expand Up @@ -158,7 +159,8 @@ function generate_nomad_config {
local -r user="$5"
local -r consul_token="$6"
local -r node_pool="$7"
local -r node_labels="$8"
local -r orchestrator_job_version="$8"
local -r node_labels="$9"
local -r config_path="$config_dir/$NOMAD_CONFIG_FILE"

local instance_name=""
Expand Down Expand Up @@ -197,6 +199,7 @@ client {
"node_pool" = "$node_pool"
"node_labels" = "${node_labels:-}"
${job_constraint:+"\"job_constraint\"" = "\"$job_constraint\""}
${orchestrator_job_version:+"\"orchestrator_job_version\"" = "\"$orchestrator_job_version\""}
}
max_kill_timeout = "24h"
}
Expand Down Expand Up @@ -385,6 +388,10 @@ function run {
node_labels="$2"
shift
;;
--orchestrator-job-version)
orchestrator_job_version="$2"
shift
;;
--cluster-tag-value)
assert_not_empty "$key" "$2"
cluster_tag_value="$2"
Expand Down Expand Up @@ -433,7 +440,7 @@ function run {

user=$(get_owner_of_path "$config_dir")

generate_nomad_config "$server" "$client" "$num_servers" "$config_dir" "$user" "$consul_token" "$node_pool" "$node_labels"
generate_nomad_config "$server" "$client" "$num_servers" "$config_dir" "$user" "$consul_token" "$node_pool" "$orchestrator_job_version" "$node_labels"
generate_supervisor_config "$SUPERVISOR_CONFIG_PATH" "$config_dir" "$data_dir" "$bin_dir" "$log_dir" "$user" "$use_sudo"
start_nomad

Expand Down
4 changes: 0 additions & 4 deletions iac/provider-aws/nomad/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,6 @@ module "logs_collector" {

vector_health_port = var.logs_health_proxy_port
vector_api_port = var.logs_proxy_port

grafana_logs_user = var.grafana_logs_user
grafana_logs_endpoint = var.grafana_logs_endpoint
grafana_api_key = var.grafana_logs_api_key
}

# ---
Expand Down
16 changes: 0 additions & 16 deletions iac/provider-aws/nomad/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -154,22 +154,6 @@ variable "grafana_username" {
sensitive = true
}

variable "grafana_logs_user" {
type = string
default = ""
}

variable "grafana_logs_endpoint" {
type = string
default = ""
}

variable "grafana_logs_api_key" {
type = string
default = ""
sensitive = true
}

# API
variable "api_port" {
type = number
Expand Down
Loading