Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ ij_yaml_spaces_within_brackets = false
[*.hcl]
indent_size = 2
indent_style = space

[*.toml]
indent_size = 2
indent_style = space
54 changes: 22 additions & 32 deletions iac/modules/job-ingress/jobs/ingress.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -81,41 +81,31 @@ job "ingress" {
image = "traefik:v3.5"
ports = ["control", "ingress"]
args = [
# Entry-points that are set internally by Traefik
"--entrypoints.web.address=:${ingress_port}",
"--entrypoints.traefik.address=:${control_port}",

# Traefik internals (logging, metrics, ...)
"--api.dashboard=true",
"--api.insecure=false",

"--accesslog=true",
"--ping=true",
"--ping.entryPoint=web",
"--metrics=true",
"--metrics.otlp=true",
"--metrics.otlp.grpc=true",
"--metrics.otlp.grpc.endpoint=${otel_collector_grpc_endpoint}",
"--metrics.otlp.grpc.insecure=true",

%{ for arg in additional_args }
"${ arg }",
%{ endfor }

# Traefik Nomad provider
"--providers.nomad=true",
"--providers.nomad.exposedByDefault=false",
"--providers.nomad.endpoint.address=${nomad_endpoint}",
"--providers.nomad.endpoint.token=${nomad_token}",

# Traefik Consul provider
"--providers.consulcatalog=true",
"--providers.consulcatalog.exposedByDefault=false",
"--providers.consulcatalog.endpoint.address=${consul_endpoint}",
"--providers.consulcatalog.endpoint.token=${consul_token}",
"--configFile=/local/traefik.toml",
]
}

template {
data = <<EOF
${traefik_config}
EOF
destination = "local/traefik.toml"
}

template {
data = ""
destination = "local/config/.keep"
}

%{ for filename, content in config_files }
template {
data = <<EOF
${content}
EOF
destination = "local/config/${filename}"
}
%{ endfor }

resources {
memory_max = ${memory_mb * 1.5}
memory = ${memory_mb}
Expand Down
32 changes: 32 additions & 0 deletions iac/modules/job-ingress/jobs/traefik.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[api]
insecure = false

[entryPoints]
[entryPoints.web]
address = ":${ingress_port}"
[entryPoints.traefik]
address = ":${control_port}"

[metrics]
[metrics.otlp]
[metrics.otlp.grpc]
endpoint = "${otel_collector_grpc_endpoint}"
insecure = true

[ping]
entryPoint = "web"

[providers]
[providers.file]
directory = "/local/config"
watch = false
[providers.nomad]
exposedByDefault = false
[providers.nomad.endpoint]
address = "${nomad_endpoint}"
token = "${nomad_token}"
[providers.consulCatalog]
exposedByDefault = false
[providers.consulCatalog.endpoint]
address = "${consul_endpoint}"
token = "${consul_token}"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Access logging silently dropped in Traefik config migration

Medium Severity

The original CLI args included --accesslog=true, but the new traefik.toml has no [accessLog] section. Traefik disables HTTP access logging by default, so this migration silently turns off access logs for all ingress traffic. Every other original CLI flag maps to an equivalent TOML setting, but accessLog is the only one missing.

Fix in Cursor Fix in Web

29 changes: 19 additions & 10 deletions iac/modules/job-ingress/main.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
locals {
traefik_config = templatefile("${path.module}/jobs/traefik.toml", {
ingress_port = var.ingress_proxy_port
control_port = var.ingress_control_port

nomad_endpoint = var.nomad_endpoint
nomad_token = var.nomad_token

consul_endpoint = var.consul_endpoint
consul_token = var.consul_token

otel_collector_grpc_endpoint = var.otel_collector_grpc_endpoint
})
}

resource "nomad_job" "ingress" {
jobspec = templatefile("${path.module}/jobs/ingress.hcl", {
count = var.ingress_count
Expand All @@ -6,16 +21,10 @@ resource "nomad_job" "ingress" {
cpu_count = var.ingress_cpu_count
memory_mb = var.ingress_memory_mb

ingress_port = var.ingress_proxy_port
control_port = var.ingress_control_port
additional_args = var.additional_traefik_arguments

nomad_endpoint = var.nomad_endpoint
nomad_token = var.nomad_token
ingress_port = var.ingress_proxy_port
control_port = var.ingress_control_port

consul_token = var.consul_token
consul_endpoint = var.consul_endpoint

otel_collector_grpc_endpoint = var.otel_collector_grpc_endpoint
traefik_config = local.traefik_config
config_files = var.traefik_config_files
})
}
5 changes: 3 additions & 2 deletions iac/modules/job-ingress/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ variable "otel_collector_grpc_endpoint" {
description = "OpenTelemetry collector gRPC endpoint (e.g., localhost:4317)"
}

variable "additional_traefik_arguments" {
type = list(string)
variable "traefik_config_files" {
type = map(string)
description = "Map of filename => content for additional Traefik dynamic configuration files"
}
6 changes: 3 additions & 3 deletions iac/provider-aws/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ module "nomad" {
admin_token = module.init.admin_token
sandbox_access_token_hash_seed = module.init.sandbox_access_token_hash_seed

ingress_port = local.ingress_port
ingress_count = var.ingress_count
additional_traefik_arguments = var.additional_traefik_arguments
ingress_port = local.ingress_port
ingress_count = var.ingress_count
traefik_config_files = var.traefik_config_files

client_proxy_count = var.client_proxy_count
client_proxy_repository_name = module.init.client_proxy_repository_name
Expand Down
6 changes: 3 additions & 3 deletions iac/provider-aws/nomad/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ module "redis" {
module "ingress" {
source = "../../modules/job-ingress"

ingress_count = var.ingress_count
ingress_proxy_port = var.ingress_port
additional_traefik_arguments = var.additional_traefik_arguments
ingress_count = var.ingress_count
ingress_proxy_port = var.ingress_port
traefik_config_files = var.traefik_config_files

node_pool = var.api_node_pool
update_stanza = var.api_cluster_size > 1
Expand Down
5 changes: 2 additions & 3 deletions iac/provider-aws/nomad/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,8 @@ variable "launch_darkly_api_key" {
sensitive = true
}

variable "additional_traefik_arguments" {
type = list(string)
default = []
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing default breaks AWS nomad module consumers

Low Severity

The old additional_traefik_arguments variable in provider-aws/nomad had default = [], making it optional. The replacement traefik_config_files has no default, making it required. Any direct caller of this module that previously relied on the implicit empty default now gets a Terraform error unless they explicitly pass traefik_config_files = {}.

Fix in Cursor Fix in Web

variable "traefik_config_files" {
type = map(string)
}

variable "db_max_open_connections" {
Expand Down
7 changes: 4 additions & 3 deletions iac/provider-aws/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,10 @@ variable "control_server_cluster_size" {
default = 3
}

variable "additional_traefik_arguments" {
type = list(string)
default = []
variable "traefik_config_files" {
type = map(string)
description = "Map of filename => content for additional Traefik dynamic configuration files"
default = {}
}

variable "db_max_open_connections" {
Expand Down
7 changes: 4 additions & 3 deletions iac/provider-gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ module "cluster" {
nomad_port = var.nomad_port
google_service_account_email = module.init.service_account_email
domain_name = var.domain_name
ingress_timeout_seconds = var.ingress_timeout_seconds

additional_domains = local.additional_domains
additional_api_paths_handled_by_ingress = var.additional_api_paths_handled_by_ingress
Expand Down Expand Up @@ -208,9 +209,9 @@ module "nomad" {
clickhouse_node_pool = var.clickhouse_node_pool

# Ingress
ingress_port = var.ingress_port
ingress_count = var.ingress_count
additional_traefik_arguments = var.additional_traefik_arguments
ingress_port = var.ingress_port
ingress_count = var.ingress_count
traefik_config_files = var.traefik_config_files

# API
api_server_count = var.api_server_count
Expand Down
1 change: 1 addition & 0 deletions iac/provider-gcp/nomad-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ module "network" {
domain_name = var.domain_name
additional_domains = var.additional_domains
additional_api_paths_handled_by_ingress = var.additional_api_paths_handled_by_ingress
ingress_timeout_seconds = var.ingress_timeout_seconds

client_proxy_port = var.client_proxy_port
client_proxy_health_port = var.client_proxy_health_port
Expand Down
2 changes: 1 addition & 1 deletion iac/provider-gcp/nomad-cluster/network/ingress.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ resource "google_compute_backend_service" "ingress" {
session_affinity = null
health_checks = [google_compute_health_check.ingress.id]

timeout_sec = 80
timeout_sec = var.ingress_timeout_seconds

load_balancing_scheme = "EXTERNAL_MANAGED"
locality_lb_policy = "ROUND_ROBIN"
Expand Down
14 changes: 12 additions & 2 deletions iac/provider-gcp/nomad-cluster/network/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,21 @@ resource "google_compute_url_map" "orch_map" {
default_service = google_compute_backend_service.default["api"].self_link

dynamic "path_rule" {
for_each = length(var.additional_api_paths_handled_by_ingress) > 0 ? [{}] : []
for_each = var.additional_api_paths_handled_by_ingress

content {
paths = var.additional_api_paths_handled_by_ingress
paths = path_rule.value.paths
service = google_compute_backend_service.ingress.self_link

dynamic "route_action" {
for_each = path_rule.value.timeout_sec != null ? [path_rule.value.timeout_sec] : []
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The != null guard allows timeout_sec = 0 to pass through, producing timeout { seconds = 0 }. GCP rejects URL map timeouts of 0 seconds at apply time. Consider adding a variable validation rule:

validation {
  condition = alltrue([
    for r in var.additional_api_paths_handled_by_ingress :
    r.timeout_sec == null || r.timeout_sec > 0
  ])
  error_message = "timeout_sec must be greater than 0 when specified."
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically true, but not meaningfully true. gcp will happily provide an error message.


content {
timeout {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeout block inside route_action is missing the nanos field. While nanos is optional in the provider (defaults to 0), the GCP API may still reject the configuration for certain provider versions. Explicitly setting nanos = 0 avoids any ambiguity:

timeout {
  seconds = route_action.value
  nanos   = 0
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically true, but not meaningfully true. gcp will happily provide an error message.

seconds = route_action.value
}
}
}
}
}
}
Expand Down
9 changes: 8 additions & 1 deletion iac/provider-gcp/nomad-cluster/network/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,5 +104,12 @@ variable "labels" {
}

variable "additional_api_paths_handled_by_ingress" {
type = list(string)
type = list(object({
paths = list(string)
timeout_sec = optional(number)
}))
}

variable "ingress_timeout_seconds" {
type = number
}
9 changes: 8 additions & 1 deletion iac/provider-gcp/nomad-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -368,5 +368,12 @@ variable "persistent_volume_types" {
}

variable "additional_api_paths_handled_by_ingress" {
type = list(string)
type = list(object({
paths = list(string)
timeout_sec = optional(number)
}))
}

variable "ingress_timeout_seconds" {
type = number
}
6 changes: 3 additions & 3 deletions iac/provider-gcp/nomad/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ data "google_secret_manager_secret_version" "redis_tls_ca_base64" {
module "ingress" {
source = "../../modules/job-ingress"

ingress_count = var.ingress_count
ingress_proxy_port = var.ingress_port.port
additional_traefik_arguments = var.additional_traefik_arguments
ingress_count = var.ingress_count
ingress_proxy_port = var.ingress_port.port
traefik_config_files = var.traefik_config_files

node_pool = var.api_node_pool
update_stanza = var.api_machine_count > 1
Expand Down
5 changes: 3 additions & 2 deletions iac/provider-gcp/nomad/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ variable "ingress_port" {
})
}

variable "additional_traefik_arguments" {
type = list(string)
variable "traefik_config_files" {
type = map(string)
description = "Map of filename => content for additional Traefik dynamic configuration files"
}

variable "ingress_count" {
Expand Down
23 changes: 16 additions & 7 deletions iac/provider-gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -132,16 +132,14 @@ variable "ingress_count" {
}

variable "additional_api_paths_handled_by_ingress" {
type = list(string)
description = "Additional paths to forward to nomad's ingress"
type = list(object({
paths = list(string)
timeout_sec = optional(number)
}))
description = "Additional path rules to forward to nomad's ingress. Each entry creates a separate path_rule with an optional per-route timeout (overrides the ingress backend default of 80s)."
default = []
}

variable "additional_traefik_arguments" {
type = list(string)
default = []
}

variable "client_proxy_resources_memory_mb" {
type = number
default = 1024
Expand Down Expand Up @@ -704,3 +702,14 @@ variable "orchestrator_env_vars" {
type = map(string)
default = {}
}

variable "traefik_config_files" {
type = map(string)
description = "Map of filename => content for additional Traefik dynamic configuration files"
default = {}
}

variable "ingress_timeout_seconds" {
type = number
default = 80
}
Loading