feat(wstransport): support concurrent accept #1919
Conversation
🏁 Performance Summary
Latency History charts (🔵 Min • 🟢 Avg • 🔴 Max), data omitted:
- TCP: latency (ms) per PR number, 1599 → 1922
- QUIC: latency (ms) per PR number, 1685 → 1922
Codecov Report
❌ Patch coverage is
Additional details and impacted files
@@ Coverage Diff @@
## master #1919 +/- ##
==========================================
+ Coverage 87.61% 87.66% +0.05%
==========================================
Files 282 282
Lines 43460 43605 +145
Branches 14 14
==========================================
+ Hits 38078 38227 +149
+ Misses 5382 5378 -4
concurrentAcceptsPerHttpServer:
  if concurrentAcceptsPerHttpServer <= 0:
    DefaultConcurrentAcceptsPerHttpServer
  else:
    concurrentAcceptsPerHttpServer,
If this logic is added here, why is it not added to the other new proc?
My main concern is that if this fallback-value logic is really needed, then it should be in the other proc as well.
if self.acceptFuts.len <= 0:
let res = await self.acceptResults.popFirst()
...
if res.isErr:
hmmm... I would still prefer to use the same construct for handling errors, that is, try-except instead of if-of. I can understand why the code was made like this, but I would prefer for all error handling across the codebase to be try-except instead of if .. of ...
pseudocode:
# early return if there is a value
if not res.isErr:
  return res.value
# handle errors
try:
  raise res.error
except WebSocketError:
  ...
except ...:
  ...
except ...:
  ...
would this be possible?
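For reference, a minimal self-contained sketch of this re-raise-and-dispatch pattern; the Result instantiation, DemoError, and handle proc are illustrative placeholders, not the actual wstransport types.

import results

type DemoError = object of CatchableError

proc handle(res: Result[int, ref CatchableError]): int =
  # early return if there is a value
  if res.isOk:
    return res.value
  # handle errors by re-raising and dispatching on the exception type
  try:
    raise res.error
  except DemoError as exc:
    echo "demo error: ", exc.msg
    raise exc
  except CatchableError as exc:
    echo "unexpected error: ", exc.msg
    raise exc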
try:
  self.acceptResults.addLastNoWait(res)
except AsyncQueueFullError:
  raise newException(AssertionDefect, "wstransport handshakeResults queue full")
Suggested change:
  try:
    self.acceptResults.addLastNoWait(res)
  except AsyncQueueFullError:
-   raise newException(AssertionDefect, "wstransport handshakeResults queue full")
+   raiseAssert "acceptResults queue is full"
If accepted, please search for and update the other places as well.
Reasoning: raiseAssert also raises AssertionDefect, and the libp2p codebase mainly uses raiseAssert.
try:
  self.acceptResults.addLastNoWait(AcceptResult.ok(conn))
except AsyncQueueFullError:
  await noCancel req.stream.closeWait()
try:
  finished = await one(self.acceptFuts)
except ValueError as exc:
  raise newException(AssertionDefect, "wstransport accept error: " & exc.msg, exc)
...
asyncSpawn self.handshakeWorker(finished, httpServer.secure)
...
await sleepAsync(0) # be nice
Is this really necessary? It would be interesting to hear the reasoning here.
try:
  let wstransp = await self.wsserver.handleRequest(req).wait(self.handshakeTimeout)
  let conn = await self.connHandler(wstransp, secure, Direction.In)
  try:
    self.acceptResults.addLastNoWait(AcceptResult.ok(conn))
  except AsyncQueueFullError:
    await noCancel req.stream.closeWait()
except CatchableError as exc:
  await noCancel req.stream.closeWait()
  try:
    self.acceptResults.addLastNoWait(AcceptResult.err(exc))
  except AsyncQueueFullError:
    discard
except CatchableError as exc:
  try:
    self.acceptResults.addLastNoWait(AcceptResult.err(exc))
  except AsyncQueueFullError:
    discard
please improve this logic:
- use specific errors instead of CatchableError
- do not use nested error handling (try-except within try-except)
for example:
let wstransp =
  try:
    await self.wsserver.handleRequest(req).wait(self.handshakeTimeout)
  except ....
let conn =
  try:
    await self.connHandler(wstransp, secure, Direction.In)
  except ....
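For illustration, a self-contained chronos sketch of that flattened shape; stepA, stepB, and StepError are placeholders standing in for handleRequest, connHandler, and their specific error types, not the actual wstransport code.

import chronos

type StepError = object of CatchableError

proc stepA(): Future[int] {.async.} =
  await sleepAsync(1.milliseconds)
  return 1

proc stepB(x: int): Future[string] {.async.} =
  await sleepAsync(1.milliseconds)
  return $x

proc worker(): Future[string] {.async.} =
  # each step gets its own narrow try/except; no try nested inside another try
  let a =
    try:
      await stepA().wait(100.milliseconds)
    except AsyncTimeoutError:
      return "timeout in stepA"
  let b =
    try:
      await stepB(a)
    except StepError:
      return "stepB failed"
  return b

when isMainModule:
  echo waitFor(worker())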
try:
  var finished: Future[HttpRequest]
  try:
    MultiAddress.init(remoteAddr).tryGet() & codec.tryGet(),
    MultiAddress.init(localAddr).tryGet() & codec.tryGet(),
  )
except CatchableError as exc:
please use a specific error instead of CatchableError.
@@ -37,6 +37,7 @@ const
  DefaultHeadersTimeout = 3.seconds
CI on Windows fails for some reason.
Will investigate. It's hanging non-deterministically in a [Suite ] WebSocket transport test:
[Test ] server with multiple parallel connections
Terminate batch job (Y/N)?
^C
Error: The operation was canceled.
...that should take 2-3 seconds every time:
[Test ] server with multiple parallel connections
@[0, 1, 2, 3, 4, 4, 2, 0, 3, 1, 4, 3, 0, 1, 2, 4, 3, 0, 1, 2, 4, 1, 0, 3, 2, 1, 4, 0, 3, 2, 1, 0, 4, 3, 4, 1, 2, 0, 2, 1, 3, 4, 1, 2, 3, 0, 4, 1, 2, 0, 1, 3, 4, 2, 0, 4, 2, 0, 3, 1, 4, 1, 2, 3, 0, 2, 4, 0, 2, 1, 0, 3, 4, 2, 1, 4, 0, 2, 3, 1, 3, 2, 0, 4, 1, 2, 3, 0, 4, 2, 1, 3, 0, 1, 4, 2, 1, 4, 0, 3, 4, 0, 1, 2, 4, 3, 3, 0, 1, 2, 4, 2, 3, 0, 1, 2, 4, 3, 4, 0, 2, 3, 1, 2, 3, 4, 1, 0, 2, 3, 4, 1, 3, 4, 2, 0, 3, 1, 4, 0, 2, 2, 3, 1, 4, 0, 3, 0, 2, 4, 1, 3, 0, 1, 4, 3, 1, 3, 0, 0]
[OK ] ( 2.62s) server with multiple parallel connections
If we add 300ms of delay between client connection attempts it doesn't hang, specifically on Windows and on Nim 2.2.6. There is a nonzero chance this is a Chronos + Nim 2.2.6 + Windows bug or some combination thereof. It doesn't seem to hang on the Nim 2.0.16 + Windows CI. I'll investigate further.
If we run a client, then wait for it to finish (with a 300ms delay), we will serialize all client connections, which defeats the purpose of the test, namely running many client connections in parallel.
If you need to test via CI, you can run only the ws tests in CI so you don't wait too long for the tests to finish.
To do this, change test/test_all.nim and just import the ws tests here:
import ./libp2p/transports/test_ws
> If we run a client, then wait for it to finish (with a 300ms delay), we will serialize all client connections, which defeats the purpose of the test, namely running many client connections in parallel.
Yes, I was just abusing the CI to quickly test some theories on Windows.
> If you need to test via CI, you can run only the ws tests in CI so you don't wait too long for the tests to run. To do this, change test/test_all.nim and just import the ws tests: import ./libp2p/transports/test_ws
Thanks!
If I need to do more Windows testing I'll set it up on a Windows machine. I didn't think it was something serious, but now I think it is not a quick fix.
Sorry for the noise. And there's a good chance that this nim-libp2p PR is dead, in any case.
consider testing status-im/nim-websock#180 - add test case that asserts concurrent accepts
FYI: a similar test failed on macos-15-arm64 (Nim v2.2.6) on "master" (technically not master, but the code just adds logs):
https://github.com/vacp2p/nim-libp2p/actions/runs/19856566195/job/56895878119?pr=1924
Thank you very much! Yes, I think this patch/PR is too risky. I'll submit an alternative one that doesn't have this problem.
# Wait for the stream handler to complete before closing
await serverStreamHandlerFuts[handlerIndex]
echo "[DEBUG] serverHandler after await serverStreamHandlerFuts ", handlerIndex
[nph] reported by reviewdog 🐶
| echo "[DEBUG] serverHandler after await serverStreamHandlerFuts ", handlerIndex | |
| echo "[DEBUG] serverHandler after await serverStreamHandlerFuts ", | |
| handlerIndex |
This PR can be closed in favor of #1937.
NOTE: This is a draft PR; it depends on status-im/nim-websock#180 and addresses bug logos-messaging/logos-delivery#3634.
Rationale
nim-websock's HttpServer.accept proc, which is used by nim-libp2p's websocket client acceptor, does two things: it accepts an incoming stream socket, and it reads and parses that client's HTTP request. If a client sends an incomplete HTTP request, HttpServer.accept will starve stream socket acceptance for the full duration of headersTimeout / handshakeTimeout (3 seconds as configured by nim-libp2p). That manifests as a buildup in the Recv-Q of the accept socket, as reported in the nwaku bug.
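To make the failure mode concrete, here is a minimal chronos sketch of the serialized behaviour described above; acceptSocket and parseHeaders are illustrative placeholders, not nim-websock's actual procs.

import chronos

proc acceptSocket(): Future[int] {.async.} =
  # a new stream socket is typically available quickly
  await sleepAsync(1.milliseconds)
  return 1

proc parseHeaders(sock: int): Future[string] {.async.} =
  # an incomplete request keeps this pending until the headers timeout fires
  await sleepAsync(3.seconds)
  return "GET / HTTP/1.1"

proc serializedAcceptLoop() {.async.} =
  while true:
    let sock = await acceptSocket()
    # while this await is pending, no further sockets are accepted,
    # so one slow or malicious client delays everyone behind it
    let req = await parseHeaders(sock)
    echo "handled request on socket ", sock, ": ", req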
Proposed solution
This PR is a follow-up to, and a replacement for, this unmerged PR in nim-websock: status-im/nim-websock#179. It moves the proposed fix one level up, from nim-websock into nim-libp2p.
The proposed solution spawns a connection accept dispatcher task as part of WsTransport start. The accept dispatcher maintains a configured number of pending HttpServer.accept futures on each of the WsTransport's HTTP servers, handling handshakes asynchronously so they do not block the accept loop.
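The sketch below illustrates the dispatcher idea with chronos; acceptOne and handshake are placeholders standing in for HttpServer.accept and the handshake worker, and concurrentAccepts plays the role of the configured concurrency (none of these are the PR's actual identifiers).

import std/sequtils
import chronos

proc acceptOne(): Future[int] {.async.} =
  # stands in for HttpServer.accept: yields the next raw HTTP request
  await sleepAsync(1.milliseconds)
  return 42

proc handshake(req: int) {.async.} =
  # stands in for the handshake worker: parse headers under a timeout and
  # enqueue the resulting connection for WsTransport's accept()
  await sleepAsync(1.milliseconds)

proc acceptDispatcher(concurrentAccepts: int) {.async.} =
  var pending: seq[Future[int]]
  while true:
    # keep a fixed number of accept futures in flight on the HTTP server
    while pending.len < concurrentAccepts:
      pending.add(acceptOne())
    # wait for any one of them to complete ...
    let finished = await one(pending)
    pending.keepItIf(it != finished)
    # ... and hand the completed request off without blocking further accepts
    asyncSpawn handshake(finished.read())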
Discussion
As @arnetheduck points out, this could be solved indirectly by shifting to a different architecture. In the meantime, we can time out header parsing with a configured level of concurrency by spawning N tasks that call HttpServer.accept, as previously suggested; that addresses the immediate service degradation caused by the implicit serialization of websocket connections and header-parsing timeouts.