Skip to content

Commit 0f260b5

Browse files
committed
Messy WIP.
1 parent 3cce817 commit 0f260b5

File tree

6 files changed

+140
-30
lines changed

6 files changed

+140
-30
lines changed

config/schema.cue

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ import "path"
5757
proxy_buffer_size: string | *"8k"
5858
proxy_buffers: string | *"8 8k"
5959
keepalive_timeout: uint | *75
60+
upstream_keepalive_connections_per_worker: uint | *10
61+
upstream_keepalive_idle_timeout: uint | *60
6062
ssl_protocols: string | *"TLSv1 TLSv1.1 TLSv1.2"
6163
ssl_ciphers: string | *"ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS"
6264
ssl_session_cache: string | *"shared:ssl_sessions:50m"

src/api-umbrella/utils/active_config_store/set_envoy_config.lua

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,17 @@ local function build_cluster_resource(cluster_name, options)
9999
},
100100
},
101101
},
102+
typed_extension_protocol_options = {
103+
["envoy.extensions.upstreams.http.v3.HttpProtocolOptions"] = {
104+
["@type"] = "type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions",
105+
upstream_http_protocol_options = {
106+
auto_sni = true,
107+
},
108+
common_http_protocol_options = {
109+
idle_timeout = "1s",
110+
},
111+
},
112+
},
102113
connect_timeout = file_config["envoy"]["_connect_timeout"],
103114
upstream_connection_options = {
104115
tcp_keepalive = {
@@ -289,7 +300,7 @@ local function build_listener()
289300
stat_prefix = "router",
290301
common_http_protocol_options = {
291302
max_headers_count = 200,
292-
idle_timeout = "120s",
303+
idle_timeout = "15s",
293304
},
294305
generate_request_id = false,
295306
server_header_transformation = "PASS_THROUGH",

templates/etc/nginx/router.conf.etlua

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,8 @@ http {
194194

195195
upstream api_umbrella_trafficserver_backend {
196196
server <%- config["trafficserver"]["host"] %>:<%- config["trafficserver"]["port"] %>;
197-
keepalive 10;
197+
keepalive <%- config["nginx"]["upstream_keepalive_connections_per_worker"] %>;
198+
keepalive_timeout <%- config["nginx"]["upstream_keepalive_idle_timeout"] %>s;
198199
}
199200

200201
server {

templates/etc/trafficserver/plugin.config.etlua

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ header_rewrite.so "<%- config["etc_dir"] %>/trafficserver/header_rewrite.conf"
1111
# the same URL: https://issues.apache.org/jira/browse/TS-3431
1212
cachekey.so --include-headers=Host,X-Api-Umbrella-Backend-Host,X-Api-Umbrella-Cache-Request-Method
1313

14+
stats_over_http.so
15+
1416
<% if config["_strip_response_cookies_regex"] then %>
1517
tslua.so "<%- config["etc_dir"] %>/trafficserver/strip_response_cookies.lua" "<%- config["etc_dir"] %>/trafficserver/strip_response_cookies_regex.txt"
1618
<% end %>

templates/etc/trafficserver/records.yaml.etlua

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ records:
3131
warning: E
3232

3333
# Enable for debug logging.
34-
# debug:
35-
# enabled: 1
36-
# tags: ".*"
34+
debug:
35+
enabled: 1
36+
tags: ".*"
3737

3838
error:
3939
logfile:
@@ -122,17 +122,20 @@ records:
122122

123123
# Keepalive connections to backend Envoy server. Retain some idle
124124
# connections open to improve performance.
125-
keep_alive_no_activity_timeout_out: <%= json_encode(config["router"]["api_backends"]["keepalive_idle_timeout"]) %>
125+
keep_alive_no_activity_timeout_out: 99 # <%= json_encode(config["router"]["api_backends"]["keepalive_idle_timeout"]) %>
126+
# transaction_no_activity_timeout_out: 1
126127
per_server:
127128
connection:
128-
min: <%= json_encode(config["router"]["api_backends"]["keepalive_connections"]) %>
129+
min: 1 # <%= json_encode(config["router"]["api_backends"]["keepalive_connections"]) %>
130+
match: ip
129131

130132
# Since we're only connecting to our backend Envoy server, we can significantly
131133
# reduce connections TrafficServer tries to open by ignoring the Host for
132134
# session reuse (since Envoy will be responsible for actually connecting to the
133135
# backend servers).
134136
server_session_sharing:
135137
match: ip
138+
pool: global_locked
136139

137140
# For read_while_writer configuration in the `cache` section.
138141
background_fill_active_timeout: 0
@@ -182,12 +185,12 @@ records:
182185
# Requires enabling redirect following, which we don't want for other
183186
# responses:
184187
# https://lists.apache.org/thread.html/0eff5d9a53ef8fdf28be341f648c708bd651ad1208cddf71d532d78d@%3Cusers.trafficserver.apache.org%3E
185-
max_doc_size: 0
186-
enable_read_while_writer: 1
187-
read_while_writer:
188-
max_retries: 21
189-
read_while_writer_retry:
190-
delay: 50
188+
# max_doc_size: 0
189+
# enable_read_while_writer: 1
190+
# read_while_writer:
191+
# max_retries: 21
192+
# read_while_writer_retry:
193+
# delay: 50
191194

192195
dns:
193196
# Set the DNS nameservers used to potentially resolve a remote envoy layer.

test/proxy/keep_alive/test_server_side.rb

Lines changed: 108 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,35 @@ def setup
99
setup_server
1010
reset_api_backend_idle_connections
1111
@keepalive_idle_timeout = 2
12+
@nginx_workers = 2
13+
@nginx_upstream_keepalive_connections_per_worker = 15
14+
@router_keepalive_connections = 25
1215
once_per_class_setup do
1316
override_config_set({
17+
:nginx => {
18+
:workers => @nginx_workers,
19+
:upstream_keepalive_connections_per_worker => @nginx_upstream_keepalive_connections_per_worker,
20+
:upstream_keepalive_idle_timeout => @keepalive_idle_timeout,
21+
},
1422
:router => {
1523
:api_backends => {
1624
:keepalive_idle_timeout => @keepalive_idle_timeout,
25+
:keepalive_connections => @router_keepalive_connections,
1726
},
1827
},
1928
})
2029

2130
prepend_api_backends([
2231
{
32+
:name => unique_test_class_id,
2333
:frontend_host => "127.0.0.1",
2434
:backend_host => "127.0.0.1",
2535
:servers => [{ :host => "127.0.0.1", :port => 9444 }],
2636
:url_matches => [{ :frontend_prefix => "/#{unique_test_class_id}/keepalive-default/", :backend_prefix => "/" }],
2737
},
2838
])
2939
end
40+
@api_backend = ApiBackend.find_by!(name: unique_test_class_id)
3041
end
3142

3243
def after_all
@@ -39,7 +50,7 @@ def test_keeps_idle_connections_open
3950
# might fix things: https://github.com/apache/trafficserver/pull/8083 In
4051
# the meantime, the current behavior means idle connections perhaps stay
4152
# around too long, but I think this should be okay for now.
42-
skip("Keepalive idle handling doesn't work as expected in Traffic Server 9.1, but the behavior should still be acceptable. Revisit in Traffic Server 9.2+.")
53+
# skip("Keepalive idle handling doesn't work as expected in Traffic Server 9.1, but the behavior should still be acceptable. Revisit in Traffic Server 9.2+.")
4354

4455
assert_idle_connections("/#{unique_test_class_id}/keepalive-default/connection-stats/", $config["router"]["api_backends"]["keepalive_connections"])
4556
end
@@ -103,17 +114,24 @@ def reset_api_backend_idle_connections
103114

104115
def assert_idle_connections(path, idle_connections)
105116
# After just making one connection, sanity check the keepalive connections
106-
# to ensure it's just 1-2 (for the current connection). Keepalive
117+
# to ensure it's just a few (for the current connection). Keepalive
107118
# connections are lazily established, so this just verifies the current
108119
# behavior of the connections only being kept once they're actually used.
109120
response = Typhoeus.get("http://127.0.0.1:9080#{path}", http_options)
110121
assert_response_code(200, response)
111-
data = MultiJson.load(response.body)
112-
assert_operator(data["connections_waiting"], :<=, 2)
122+
stats = connection_stats
123+
ap stats
124+
assert_includes(1..4, stats.fetch(:nginx_router_to_trafficserver_active_connections_per_trafficserver))
125+
assert_includes(1..4, stats.fetch(:trafficserver_to_envoy_active_connections_per_trafficserver))
126+
assert_includes(1..4, stats.fetch(:trafficserver_to_envoy_active_connections_per_envoy))
127+
assert_includes(1..4, stats.fetch(:envoy_to_api_backend_active_connections_per_envoy))
128+
assert_includes(1..4, stats.fetch(:envoy_to_api_backend_active_connections_per_api_backend))
129+
assert_includes(1..4, stats.fetch(:envoy_to_api_backend_idle_connections_per_api_backend))
113130

114131
# Open a bunch of concurrent connections first, and then inspect the number
115132
# of connections still active afterwards.
116-
hydra = Typhoeus::Hydra.new(:max_concurrency => 200)
133+
max_concurrency = 200
134+
hydra = Typhoeus::Hydra.new(max_concurrency: max_concurrency)
117135
500.times do
118136
request = Typhoeus::Request.new("http://127.0.0.1:9080#{path}", http_options)
119137
request.on_complete do |resp|
@@ -127,10 +145,32 @@ def assert_idle_connections(path, idle_connections)
127145
# bunch of idle connections open, since Traffic Server keeps these
128146
# connections around until the keepalive_idle_timeout is reached (which
129147
# we've lowered for testing purposes).
130-
response = Typhoeus.get("http://127.0.0.1:9444/connection-stats/", http_options)
131-
assert_response_code(200, response)
132-
data = MultiJson.load(response.body)
133-
assert_operator(data["connections_waiting"], :>, idle_connections + 2)
148+
stats = connection_stats
149+
ap stats
150+
assert_in_delta(@nginx_upstream_keepalive_connections_per_worker * @nginx_workers, stats.fetch(:nginx_router_to_trafficserver_active_connections_per_trafficserver), 5)
151+
assert_operator(stats.fetch(:nginx_router_to_trafficserver_active_connections_per_trafficserver), :<=, max_concurrency)
152+
assert_operator(stats.fetch(:trafficserver_to_envoy_active_connections_per_trafficserver), :>, idle_connections + 2)
153+
assert_operator(stats.fetch(:trafficserver_to_envoy_active_connections_per_trafficserver), :<=, max_concurrency)
154+
assert_operator(stats.fetch(:trafficserver_to_envoy_active_connections_per_envoy), :>, idle_connections + 2)
155+
assert_operator(stats.fetch(:trafficserver_to_envoy_active_connections_per_envoy), :<=, max_concurrency)
156+
assert_operator(stats.fetch(:envoy_to_api_backend_active_connections_per_envoy), :>, idle_connections + 2)
157+
assert_operator(stats.fetch(:envoy_to_api_backend_active_connections_per_envoy), :<=, max_concurrency)
158+
assert_operator(stats.fetch(:envoy_to_api_backend_active_connections_per_api_backend), :>, idle_connections + 2)
159+
assert_operator(stats.fetch(:envoy_to_api_backend_active_connections_per_api_backend), :<=, max_concurrency)
160+
assert_operator(stats.fetch(:envoy_to_api_backend_idle_connections_per_api_backend), :>, idle_connections + 2)
161+
assert_operator(stats.fetch(:envoy_to_api_backend_idle_connections_per_api_backend), :<=, max_concurrency)
162+
163+
300.times do
164+
request = Typhoeus::Request.new("http://127.0.0.1:9080#{path}", http_options)
165+
request.on_complete do |resp|
166+
assert_response_code(200, resp)
167+
end
168+
hydra.queue(request)
169+
end
170+
hydra.run
171+
172+
stats = connection_stats
173+
ap stats
134174

135175
# Wait for the keepalive timeout to expire, after which the number of idle
136176
# connections should be lowered to just the persistent ones that are kept
@@ -141,15 +181,30 @@ def assert_idle_connections(path, idle_connections)
141181
# but add a considerable buffer to this, since we see some sporadic
142182
# issues where this sometimes takes longer in the test suite (but the
143183
# exact timing of this behavior isn't really that important).
144-
Timeout.timeout(@keepalive_idle_timeout + 10) do
184+
Timeout.timeout(@keepalive_idle_timeout + 300) do
145185
loop do
146-
response = Typhoeus.get("http://127.0.0.1:9444/connection-stats/", http_options)
147-
if(response.code == 200)
148-
data = MultiJson.load(response.body)
149-
if(data["connections_waiting"] <= idle_connections + 2)
150-
break
151-
end
152-
end
186+
stats = connection_stats
187+
ap stats
188+
189+
# response = Typhoeus.get("http://127.0.0.1:13001/stats?filter=downstream_cx", http_options)
190+
# if(response.code == 200)
191+
# puts response.body
192+
# end
193+
194+
# response = Typhoeus.get("http://127.0.0.1:13009/_stats/csv", http_options)
195+
# if(response.code == 200)
196+
# puts response.body
197+
# end
198+
199+
200+
# response = Typhoeus.get("http://127.0.0.1:9444/connection-stats/", http_options)
201+
# if(response.code == 200)
202+
# data = MultiJson.load(response.body)
203+
# ap data
204+
# if(data["connections_waiting"] <= idle_connections + 2)
205+
# break
206+
# end
207+
# end
153208

154209
sleep 0.1
155210
end
@@ -181,4 +236,40 @@ def assert_idle_connections(path, idle_connections)
181236
assert_operator(data["connections_waiting"], :>=, idle_connections - 1)
182237
assert_operator(data["connections_waiting"], :<=, idle_connections + 2)
183238
end
239+
240+
def connection_stats
241+
stats = {}
242+
243+
response = Typhoeus.get("http://127.0.0.1:9444/connection-stats/", http_options)
244+
assert_response_code(200, response)
245+
stats[:api_backend] = MultiJson.load(response.body)
246+
247+
response = Typhoeus.get("http://127.0.0.1:13001/stats", http_options.deep_merge({
248+
params: {
249+
format: "json",
250+
filter: "(downstream_cx|upstream_cx)",
251+
},
252+
}))
253+
assert_response_code(200, response)
254+
stats[:envoy] = MultiJson.load(response.body).fetch("stats").each_with_object({}) { |stat, data| data[stat["name"]] = stat["value"] if stat["name"] }
255+
256+
response = Typhoeus.get("http://127.0.0.1:13009/_stats", http_options)
257+
assert_response_code(200, response)
258+
stats[:trafficserver] = MultiJson.load(response.body).fetch("global").each_with_object({}) { |(key, value), data| data[key] = Integer(value, exception: false) || Float(value, exception: false) || value }
259+
260+
stats[:nginx_router_to_trafficserver_active_connections_per_trafficserver] = stats.fetch(:trafficserver).fetch("proxy.process.http.current_client_connections")
261+
stats[:trafficserver_to_envoy_active_connections_per_trafficserver] = stats.fetch(:trafficserver).fetch("proxy.process.http.current_server_connections")
262+
stats[:trafficserver_to_envoy_active_connections_per_envoy] = stats.fetch(:envoy).fetch("http.router.downstream_cx_active")
263+
stats[:envoy_to_api_backend_active_connections_per_envoy] = stats.fetch(:envoy).fetch("cluster.api-backend-cluster-#{@api_backend.id}.upstream_cx_active")
264+
stats[:envoy_to_api_backend_active_connections_per_api_backend] = stats.fetch(:api_backend).fetch("connections_active")
265+
stats[:envoy_to_api_backend_idle_connections_per_api_backend] = stats.fetch(:api_backend).fetch("connections_waiting")
266+
# assert_operator(.to_i, :>, idle_connections + 2)
267+
# assert_operator(stats.fetch(:trafficserver).fetch("proxy.process.http.current_server_connections").to_i, :>, idle_connections + 2)
268+
# assert_operator(stats.fetch(:trafficserver).fetch("proxy.process.http.pooled_server_connections").to_i, :>, idle_connections + 2)
269+
# assert_operator(stats.fetch(:envoy).fetch("http.router.downstream_cx_active"), :>, idle_connections + 2)
270+
# assert_equal(0, stats.fetch(:envoy).fetch("http.router.downstream_cx_destroy_remote"))
271+
# assert_operator(stats.fetch(:api_backend).fetch("connections_waiting"), :>, idle_connections + 2)
272+
273+
stats
274+
end
184275
end

0 commit comments

Comments
 (0)