@@ -85,11 +85,16 @@ local function build_cluster_resource(cluster_name, options)
8585 local resource = {
8686 [" @type" ] = " type.googleapis.com/envoy.config.cluster.v3.Cluster" ,
8787 name = cluster_name ,
88- type = " STRICT_DNS" ,
88+ cluster_type = {
89+ name = " envoy.clusters.strict_dns" ,
90+ typed_config = {
91+ [" @type" ] = " type.googleapis.com/envoy.extensions.clusters.dns.v3.DnsCluster" ,
92+ typed_dns_resolver_config = dns_resolver_config ,
93+ respect_dns_ttl = true ,
94+ },
95+ },
8996 wait_for_warm_on_init = false ,
90- typed_dns_resolver_config = dns_resolver_config ,
9197 dns_lookup_family = dns_lookup_family ,
92- respect_dns_ttl = true ,
9398 ignore_health_on_host_removal = true ,
9499 load_assignment = {
95100 cluster_name = cluster_name ,
@@ -102,11 +107,22 @@ local function build_cluster_resource(cluster_name, options)
102107 typed_extension_protocol_options = {
103108 [" envoy.extensions.upstreams.http.v3.HttpProtocolOptions" ] = {
104109 [" @type" ] = " type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions" ,
105- upstream_http_protocol_options = {
106- auto_sni = true ,
110+ -- Make all API backend requests over HTTP 1.1 (instead of HTTP 2 or
111+ -- 3). Since our nginx layer already downgraded any proxied requests to
112+ -- HTTP 1.1 (since nginx doesn't currently support proxying via other
113+ -- versions), it seems simplest to stick with that approach (even
114+ -- though Envoy could proxy using a different version).
115+ explicit_http_config = {
116+ http_protocol_options = {},
107117 },
108118 common_http_protocol_options = {
109- idle_timeout = " 1s" ,
119+ -- Idle timeout for keepalive connections to upstream servers (API
120+ -- backends).
121+ --
122+ -- Since API backends can be remote, keepalive connections can be
123+ -- important to improving performance by keeping pre-established
124+ -- connections around.
125+ idle_timeout = file_config [" router" ][" api_backends" ][" keepalive_idle_timeout" ] .. " s" ,
110126 },
111127 },
112128 },
@@ -120,19 +136,16 @@ local function build_cluster_resource(cluster_name, options)
120136 },
121137 }
122138
123- -- Use the "negative_ttl" time as Envoy's DNS refresh rate. Since we have
124- -- "respect_dns_ttl" enabled, successful DNS requests will use that refresh
125- -- rate instead of this one. So effectively the "dns_refresh_rate" should
126- -- only be used in failure situations, so we can use this to provide a TTL
127- -- for negative responses.
128- --
129- -- Envoy also supports the more explicit "dns_failure_refresh_rate" option,
130- -- but that includes an exponential backoff algorithm, with random jitter,
131- -- making it harder to test against. So to replicate how our "negative_ttl"
132- -- has worked under other DNS situations, we will use this "dns_refresh_rate"
133- -- (which doesn't do backoff or jitter).
139+ -- Use the "negative_ttl" time as Envoy's DNS refresh rate when failures
140+ -- occur (via "dns_failure_refresh_rate"). Since we have "respect_dns_ttl"
141+ -- enabled, successful DNS responses refresh on their own TTL, so this rate
142+ -- only applies after failed lookups — effectively a TTL for negative
143+ -- responses. base_interval equals max_interval to avoid backoff/jitter.
134144 if file_config [" dns_resolver" ][" negative_ttl" ] then
135- resource [" dns_refresh_rate" ] = file_config [" dns_resolver" ][" negative_ttl" ] .. " s"
145+ resource [" cluster_type" ][" typed_config" ][" dns_failure_refresh_rate" ] = {
146+ base_interval = file_config [" dns_resolver" ][" negative_ttl" ] .. " s" ,
147+ max_interval = file_config [" dns_resolver" ][" negative_ttl" ] .. " s" ,
148+ }
136149 end
137150
138151 local servers
@@ -300,7 +313,17 @@ local function build_listener()
300313 stat_prefix = " router" ,
301314 common_http_protocol_options = {
302315 max_headers_count = 200 ,
303- idle_timeout = " 15s" ,
316+ -- Idle timeout for keepalive connections to downstream server
317+ -- (Traffic Server).
318+ --
319+ -- We set this slightly longer than Traffic Server's own idle
320+ -- timeout, since Traffic Server should really be responsible for
321+ -- closing its own connections, so this shouldn't normally kick in.
322+ -- However, we will still add a timeout here since we've seen
323+ -- cases where Traffic Server doesn't close idle connections as
324+ -- expected (like if Traffic Server's
325+ -- `http.per_server.connection.min` setting is set).
326+ -- NOTE(review): the assignment below is commented out, so no
326+ -- idle_timeout is actually applied here — confirm whether disabling
326+ -- it (contradicting the comment above) is intentional.
326+ -- idle_timeout = file_config["trafficserver"]["records"]["http"]["keep_alive_no_activity_timeout_out"] + 5 .. "s"
304327 },
305328 generate_request_id = false ,
306329 server_header_transformation = " PASS_THROUGH" ,
0 commit comments