Skip to content

Commit 614c44c

Browse files
authored
fix: nacos service discovery request lacks retries after failure (#12734)
1 parent 575f217 commit 614c44c

File tree

2 files changed

+145
-46
lines changed

2 files changed

+145
-46
lines changed

apisix/discovery/nacos/init.lua

Lines changed: 98 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@ local core = require('apisix.core')
2222
local ipairs = ipairs
2323
local pairs = pairs
2424
local type = type
25-
local math = math
2625
local math_random = math.random
2726
local ngx = ngx
2827
local ngx_re = require('ngx.re')
@@ -164,10 +163,7 @@ local function get_signed_param(group_name, service_name)
164163
end
165164

166165

167-
local function get_base_uri()
168-
local host = local_conf.discovery.nacos.host
169-
-- TODO Add health check to get healthy nodes.
170-
local url = host[math_random(#host)]
166+
local function build_base_uri(url)
171167
local auth_idx = core.string.rfind_char(url, '@')
172168
local username, password
173169
if auth_idx then
@@ -195,6 +191,18 @@ local function get_base_uri()
195191
end
196192

197193

194+
-- Resolve the nacos host configured at position `index` into the values
-- produced by build_base_uri.
-- @param index  position inside local_conf.discovery.nacos.host
-- @return whatever build_base_uri(url) returns for the configured entry,
--         or nil when no entry exists at that position
local function get_base_uri_by_index(index)
    local url = local_conf.discovery.nacos.host[index]
    if url then
        return build_base_uri(url)
    end

    return nil
end
204+
205+
198206
local function de_duplication(services, namespace_id, group_name, service_name, scheme)
199207
for _, service in ipairs(services) do
200208
if service.namespace_id == namespace_id and service.group_name == group_name
@@ -277,69 +285,114 @@ local function is_grpc(scheme)
277285
end
278286

279287
local curr_service_in_use = {}
280-
local function fetch_full_registry(premature)
281-
if premature then
282-
return
283-
end
284288

285-
local base_uri, username, password = get_base_uri()
289+
290+
-- Query one nacos host for the instance list of every entry in `services`
-- and publish the resulting nodes to nacos_dict.
-- @param base_uri  base URI handed to get_token_param and get_url
-- @param username  credential handed to get_token_param
-- @param password  credential handed to get_token_param
-- @param services  list walked with ipairs; each entry supplies namespace_id,
--                  group_name, service_name and an optional scheme
-- @return true when at least one get_url request succeeded;
--         false plus an error otherwise (the get_token_param error, or
--         'all nacos services fetch failed' when every request failed)
-- Side effects on success: writes JSON-encoded node lists to nacos_dict,
-- deletes stale keys from it and replaces curr_service_in_use.
local function fetch_from_host(base_uri, username, password, services)
286291
local token_param, err = get_token_param(base_uri, username, password)
287292
if err then
288-
log.error('get_token_param error:', err)
289-
return
293+
return false, err
290294
end
291295

292-
local infos = get_nacos_services()
293-
if #infos == 0 then
294-
return
295-
end
296296
local service_names = {}
297-
for _, service_info in ipairs(infos) do
298-
local data, err
297+
local nodes_cache = {}
298+
local had_success = false
299+
300+
for _, service_info in ipairs(services) do
299301
local namespace_id = service_info.namespace_id
300302
local group_name = service_info.group_name
301303
local scheme = service_info.scheme or ''
302-
local namespace_param = get_namespace_param(service_info.namespace_id)
303-
local group_name_param = get_group_name_param(service_info.group_name)
304-
local signature_param = get_signed_param(service_info.group_name, service_info.service_name)
304+
local namespace_param = get_namespace_param(namespace_id)
305+
local group_name_param = get_group_name_param(group_name)
306+
local signature_param = get_signed_param(group_name, service_info.service_name)
305307
local query_path = instance_list_path .. service_info.service_name
306308
.. token_param .. namespace_param .. group_name_param
307309
.. signature_param
308-
data, err = get_url(base_uri, query_path)
309-
if err then
310-
log.error('get_url:', query_path, ' err:', err)
311-
goto CONTINUE
312-
end
310+
local data, req_err = get_url(base_uri, query_path)
311+
if req_err then
312+
log.error('failed to fetch instances for service [', service_info.service_name,
313+
'] from ', base_uri, ', error: ', req_err)
314+
else
315+
had_success = true
316+
317+
local key = get_key(namespace_id, group_name, service_info.service_name)
318+
service_names[key] = true
313319

314-
local nodes = {}
315-
local key = get_key(namespace_id, group_name, service_info.service_name)
316-
service_names[key] = true
317-
for _, host in ipairs(data.hosts) do
318-
local node = {
319-
host = host.ip,
320-
port = host.port,
321-
weight = host.weight or default_weight,
322-
}
323-
-- docs: https://github.com/yidongnan/grpc-spring-boot-starter/pull/496
324-
if is_grpc(scheme) and host.metadata and host.metadata.gRPC_port then
325-
node.port = host.metadata.gRPC_port
320+
local hosts = data.hosts
321+
if type(hosts) ~= 'table' then
322+
hosts = {}
326323
end
327324

328-
core.table.insert(nodes, node)
329-
end
330-
if #nodes > 0 then
331-
local content = core.json.encode(nodes)
332-
nacos_dict:set(key, content)
325+
local nodes = {}
326+
for _, host in ipairs(hosts) do
327+
local node = {
328+
host = host.ip,
329+
port = host.port,
330+
weight = host.weight or default_weight,
331+
}
332+
-- docs: https://github.com/yidongnan/grpc-spring-boot-starter/pull/496
333+
if is_grpc(scheme) and host.metadata and host.metadata.gRPC_port then
334+
node.port = host.metadata.gRPC_port
335+
end
336+
337+
core.table.insert(nodes, node)
338+
end
339+
340+
if #nodes > 0 then
341+
nodes_cache[key] = nodes
342+
end
333343
end
334-
::CONTINUE::
335344
end
336-
-- remove services that are not in use anymore
345+
346+
-- Every get_url request failed: return before nacos_dict is touched so the
-- caller receives the failure.
if not had_success then
347+
return false, 'all nacos services fetch failed'
348+
end
349+
350+
-- Store the collected node lists in nacos_dict as JSON strings.
for key, nodes in pairs(nodes_cache) do
351+
local content = core.json.encode(nodes)
352+
nacos_dict:set(key, content)
353+
end
354+
337355
-- Delete dict entries for keys recorded in curr_service_in_use that were not
-- fetched successfully in this pass (service_names is only filled on success).
for key, _ in pairs(curr_service_in_use) do
338356
if not service_names[key] then
339357
nacos_dict:delete(key)
340358
end
341359
end
360+
342361
curr_service_in_use = service_names
362+
return true
363+
end
364+
365+
366+
-- Timer callback: refresh the nacos registry, trying each configured nacos
-- host at most once until one of them answers.
-- @param premature  truthy when the timer fires prematurely; the callback
--                   then returns without doing anything
local function fetch_full_registry(premature)
    if premature then
        return
    end

    local services = get_nacos_services()
    if #services == 0 then
        return
    end

    local hosts = local_conf.discovery.nacos.host
    local total = #hosts
    -- Start at a random host, then walk the list once with wrap-around so
    -- every host gets exactly one attempt.
    local first = math_random(total)

    for offset = 1, total do
        local pos = (first + offset - 2) % total + 1
        local base_uri, username, password = get_base_uri_by_index(pos)

        if base_uri then
            local ok, err = fetch_from_host(base_uri, username, password, services)
            if ok then
                return
            end
            log.error('fetch_from_host: ', base_uri, ' err:', err)
        else
            log.warn('nacos host at index ', pos, ' is invalid, skip')
        end
    end

    -- Reached only when no host produced a successful fetch.
    log.error('failed to fetch nacos registry from all hosts')
end
344397

345398

t/discovery/nacos2.t

Lines changed: 47 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ GET /hello
8383
--- response_body_like eval
8484
qr/server [1-2]/
8585
--- error_log
86-
err:status = 502
86+
error: status = 502
8787
8888
8989
@@ -340,3 +340,49 @@ discovery:
340340
}
341341
--- response_body
342342
2
343+
344+
345+
346+
=== TEST 6: fallback to next nacos host when current host fails
347+
--- yaml_config
348+
apisix:
349+
node_listen: 1984
350+
deployment:
351+
role: data_plane
352+
role_data_plane:
353+
config_provider: yaml
354+
discovery:
355+
nacos:
356+
host:
357+
- "http://127.0.0.1:20998"
358+
- "http://127.0.0.1:8858"
359+
prefix: "/nacos/v1/"
360+
fetch_interval: 1
361+
weight: 1
362+
timeout:
363+
connect: 2000
364+
send: 2000
365+
read: 5000
366+
--- apisix_yaml
367+
routes:
368+
-
369+
uri: /hello
370+
upstream:
371+
service_name: APISIX-NACOS
372+
discovery_type: nacos
373+
type: roundrobin
374+
#END
375+
--- http_config
376+
server {
377+
listen 20998;
378+
379+
location / {
380+
return 502;
381+
}
382+
}
383+
--- request
384+
GET /hello
385+
--- response_body_like eval
386+
qr/server [1-2]/
387+
--- error_log
388+
fetch_from_host: http://127.0.0.1:20998/nacos/v1/ err:all nacos services fetch failed

0 commit comments

Comments
 (0)