Skip to content

Commit fd1c014

Browse files
authored
Retry DNS resolution and add more DP+EP with various all2all backends examples (#897)
* Retry DNS resolution and add more DP+EP with various all2all backends examples. We need to retry DNS resolution because it might fail for a relatively short period of time. This is especially evident with the new examples that use a PVC (vs. previously HF), where pod startup is very fast and DNS resolution fails a few times. Signed-off-by: Pierangelo Di Pilato <[email protected]> * Add comments to examples and configs Signed-off-by: Pierangelo Di Pilato <[email protected]> --------- Signed-off-by: Pierangelo Di Pilato <[email protected]>
1 parent 2ab754c commit fd1c014

13 files changed

+2066
-14
lines changed

config/llmisvc/config-llm-decode-worker-data-parallel.yaml

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,24 @@ spec:
7474
args:
7575
- |-
7676
# In some versions, ZMQ bind doesn't resolve the address through DNS
77-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
78-
echo "DP_ADDRESS=${DP_ADDRESS}"
77+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
78+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
79+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
80+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
81+
if [ -n "$DP_ADDRESS" ]; then
82+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
83+
break
84+
else
85+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
86+
sleep 1
87+
fi
88+
done
89+
90+
if [ -z "$DP_ADDRESS" ]; then
91+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
92+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
93+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
94+
fi
7995
8096
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
8197
echo "Trying to infer RoCE configs ... "
@@ -295,8 +311,24 @@ spec:
295311
args:
296312
- |-
297313
# In some versions, ZMQ bind doesn't resolve the address through DNS
298-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
299-
echo "DP_ADDRESS=${DP_ADDRESS}"
314+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
315+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
316+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
317+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
318+
if [ -n "$DP_ADDRESS" ]; then
319+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
320+
break
321+
else
322+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
323+
sleep 1
324+
fi
325+
done
326+
327+
if [ -z "$DP_ADDRESS" ]; then
328+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
329+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
330+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
331+
fi
300332
301333
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
302334
echo "Trying to infer RoCE configs ... "

config/llmisvc/config-llm-prefill-worker-data-parallel.yaml

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,24 @@ spec:
1818
args:
1919
- |-
2020
# In some versions, ZMQ bind doesn't resolve the address through DNS
21-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
22-
echo "DP_ADDRESS=${DP_ADDRESS}"
21+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
22+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
23+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
24+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
25+
if [ -n "$DP_ADDRESS" ]; then
26+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
27+
break
28+
else
29+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
30+
sleep 1
31+
fi
32+
done
33+
34+
if [ -z "$DP_ADDRESS" ]; then
35+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
36+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
37+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
38+
fi
2339
2440
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
2541
echo "Trying to infer RoCE configs ... "
@@ -239,8 +255,24 @@ spec:
239255
args:
240256
- |-
241257
# In some versions, ZMQ bind doesn't resolve the address through DNS
242-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
243-
echo "DP_ADDRESS=${DP_ADDRESS}"
258+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
259+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
260+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
261+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
262+
if [ -n "$DP_ADDRESS" ]; then
263+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
264+
break
265+
else
266+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
267+
sleep 1
268+
fi
269+
done
270+
271+
if [ -z "$DP_ADDRESS" ]; then
272+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
273+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
274+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
275+
fi
244276
245277
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
246278
echo "Trying to infer RoCE configs ... "

config/llmisvc/config-llm-worker-data-parallel.yaml

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,25 @@ spec:
1616
- "-c"
1717
args:
1818
- |-
19-
# In some versions, ZMP bind doesn't resolve the address through DNS
20-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
21-
echo "DP_ADDRESS=${DP_ADDRESS}"
19+
# In some versions, ZMQ bind doesn't resolve the address through DNS
20+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
21+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
22+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
23+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
24+
if [ -n "$DP_ADDRESS" ]; then
25+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
26+
break
27+
else
28+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
29+
sleep 1
30+
fi
31+
done
32+
33+
if [ -z "$DP_ADDRESS" ]; then
34+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
35+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
36+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
37+
fi
2238
2339
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
2440
echo "Trying to infer RoCE configs ... "
@@ -237,9 +253,25 @@ spec:
237253
- "-c"
238254
args:
239255
- |-
240-
# In some versions, ZMP bind doesn't resolve the address through DNS
241-
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
242-
echo "DP_ADDRESS=${DP_ADDRESS}"
256+
# In some versions, ZMQ bind doesn't resolve the address through DNS
257+
# Retry DP_ADDRESS resolution (configurable attempts, default 30)
258+
RESOLVE_ATTEMPTS=${DP_ADDRESS_RESOLVE_ATTEMPTS:-30}
259+
for ((i=1; i<=RESOLVE_ATTEMPTS; i++)); do
260+
DP_ADDRESS=$(getent hosts ${LWS_LEADER_ADDRESS} | cut -d' ' -f1)
261+
if [ -n "$DP_ADDRESS" ]; then
262+
echo "DP_ADDRESS=${DP_ADDRESS} (resolved on attempt $i)"
263+
break
264+
else
265+
echo "DP_ADDRESS resolution failed on attempt $i, retrying..."
266+
sleep 1
267+
fi
268+
done
269+
270+
if [ -z "$DP_ADDRESS" ]; then
271+
echo "WARNING: Failed to resolve DP_ADDRESS after ${RESOLVE_ATTEMPTS} attempts, falling back to LWS_LEADER_ADDRESS"
272+
DP_ADDRESS=${LWS_LEADER_ADDRESS}
273+
echo "DP_ADDRESS=${DP_ADDRESS} (fallback)"
274+
fi
243275
244276
if [ "$KSERVE_INFER_ROCE" = "true" ]; then
245277
echo "Trying to infer RoCE configs ... "

0 commit comments

Comments
 (0)