
Commit 80ae8e9

Added throughput benchmark example (#72)
* Windows fix
* Added throughput benchmark example
* Added throughput documentation
* Added native latencies
* Performance latency test with websocket conditional compilation
* Added latency documentation
* Updated performance docs
1 parent 0bafb31 commit 80ae8e9

File tree

7 files changed: +559 −69 lines changed


Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -46,5 +46,5 @@ httparse = "1.3.5"
 doc-comment = "0.3"

 [[bench]]
-name = "performance"
+name = "latency"
 harness = false

README.md

Lines changed: 2 additions & 4 deletions
@@ -55,12 +55,10 @@ You could change the transport of your application in literally one line.
   "One thread to rule them all".
 - Easy error handling:
   do not deal with dark internal `std::io::Error` when send/receive from the network.
-- High performance:
-  - Non-blocking sockets: scale the application without wasting memory and time synchonizing
-    multiple threads.
+- High performance (see the [benchmarks](docs/performance_benchmarks.md)):
   - Write/read messages with zero-copy.
     You write and read directly from the internal OS socket buffer without any copy in the middle by the library.
-  - Full duplex: simultaneous reading/writing operations over same internal OS sockets.
+  - Full duplex: simultaneous reading/writing operations over same internal OS socket.
 - Customizable: `message-io` doesn't have the transport you need?
   Add easily and [adapter](#custom-adapter).

benches/latency.rs

Lines changed: 168 additions & 0 deletions
use message_io::network::{self, Transport, NetworkController, NetworkProcessor, Endpoint};
use message_io::util::thread::{NamespacedThread};
use message_io::util::encoding::{self, Decoder, MAX_ENCODED_SIZE};

use criterion::{criterion_group, criterion_main, Criterion};

#[cfg(feature = "websocket")]
use tungstenite::{Message, connect as ws_connect, accept as ws_accept};
use url::{Url};

use std::net::{TcpListener, TcpStream, UdpSocket};
use std::time::{Duration};
use std::sync::{
    Arc,
    atomic::{AtomicBool, Ordering},
};
use std::io::{Write, Read};

lazy_static::lazy_static! {
    static ref TIMEOUT: Duration = Duration::from_millis(100);
}

fn init_connection(transport: Transport) -> (NetworkController, NetworkProcessor, Endpoint) {
    let (controller, mut processor) = network::split();

    let running = Arc::new(AtomicBool::new(true));
    let mut thread = {
        let running = running.clone();
        NamespacedThread::spawn("perf-listening", move || {
            while running.load(Ordering::Relaxed) {
                processor.process_poll_event(Some(*TIMEOUT), |_| ());
            }
            processor
        })
    };

    let receiver_addr = controller.listen(transport, "127.0.0.1:0").unwrap().1;
    let receiver = controller.connect(transport, receiver_addr).unwrap().0;

    running.store(false, Ordering::Relaxed);
    let processor = thread.join();
    // From here, the connection is performed independently of the transport used

    (controller, processor, receiver)
}

fn latency_by(c: &mut Criterion, transport: Transport) {
    let msg = format!("latency by {}", transport);
    c.bench_function(&msg, |b| {
        let (controller, mut processor, endpoint) = init_connection(transport);

        b.iter(|| {
            controller.send(endpoint, &[0xFF]);
            processor.process_poll_event(Some(*TIMEOUT), |_| ());
        });
    });
}

fn latency_by_native_udp(c: &mut Criterion) {
    let msg = format!("latency by native Udp");
    c.bench_function(&msg, |b| {
        let receiver = UdpSocket::bind("127.0.0.1:0").unwrap();
        let addr = receiver.local_addr().unwrap();

        let sender = UdpSocket::bind("127.0.0.1:0").unwrap();
        sender.connect(addr).unwrap();

        let mut buffer: [u8; 1] = [0; 1];

        b.iter(|| {
            sender.send(&[0xFF]).unwrap();
            receiver.recv(&mut buffer).unwrap();
        });
    });
}

fn latency_by_native_tcp(c: &mut Criterion) {
    let msg = format!("latency by native Tcp");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut sender = TcpStream::connect(addr).unwrap();
        let (mut receiver, _) = listener.accept().unwrap();

        let mut buffer: [u8; 1] = [0; 1];

        b.iter(|| {
            sender.write(&[0xFF]).unwrap();
            receiver.read(&mut buffer).unwrap();
        });
    });
}

fn latency_by_native_framed_tcp(c: &mut Criterion) {
    let msg = format!("latency by native FramedTcp");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut sender = TcpStream::connect(addr).unwrap();
        let (mut receiver, _) = listener.accept().unwrap();

        let mut buffer: [u8; 1] = [0; 1];
        let mut framming = [0; MAX_ENCODED_SIZE]; // used to avoid a heap allocation
        let mut decoder = Decoder::default();

        b.iter(|| {
            let encoded_size = encoding::encode_size(&[0xFF], &mut framming);
            sender.write(&encoded_size).unwrap();
            sender.write(&[0xFF]).unwrap();

            let mut message_received = false;
            while !message_received {
                let bytes = receiver.read(&mut buffer).unwrap();
                decoder.decode(&buffer[0..bytes], |_decoded_data| message_received = true);
            }
        });
    });
}

#[cfg(feature = "websocket")]
fn latency_by_native_web_socket(c: &mut Criterion) {
    let msg = format!("latency by native Ws");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut listen_thread = NamespacedThread::spawn("perf-listening", move || {
            ws_accept(listener.accept().unwrap().0).unwrap()
        });

        let url_addr = format!("ws://{}/socket", addr);
        let (mut sender, _) = ws_connect(Url::parse(&url_addr).unwrap()).unwrap();

        let mut receiver = listen_thread.join();

        let message = vec![0xFF];

        b.iter(|| {
            sender.write_message(Message::Binary(message.clone())).unwrap();
            receiver.read_message().unwrap();
        });
    });
}

fn latency(c: &mut Criterion) {
    #[cfg(feature = "udp")]
    latency_by(c, Transport::Udp);
    #[cfg(feature = "tcp")]
    latency_by(c, Transport::Tcp);
    #[cfg(feature = "tcp")]
    latency_by(c, Transport::FramedTcp);
    #[cfg(feature = "websocket")]
    latency_by(c, Transport::Ws);

    #[cfg(feature = "udp")]
    latency_by_native_udp(c);
    #[cfg(feature = "tcp")]
    latency_by_native_tcp(c);
    #[cfg(feature = "tcp")]
    latency_by_native_framed_tcp(c);
    #[cfg(feature = "websocket")]
    latency_by_native_web_socket(c);
}

criterion_group!(benches, latency);
criterion_main!(benches);

benches/performance.rs

Lines changed: 0 additions & 64 deletions
This file was deleted.

docs/performance_benchmarks.md

Lines changed: 84 additions & 0 deletions
# Performance

`message-io` is focused on getting the best performance, adding the minimum possible overhead over the raw OS sockets. As performance features, it offers:

- Zero-copy read/write of messages: messages sent and received pass their data reference (`&[u8]`) through the library, from user space to the OS socket and vice versa, without copies. This means that the overhead the library adds is independent of the size of the message, because the library only copies the pointer to the data, no matter how long that data is (a minimal sketch follows this list).
- Full-duplex: from two different threads, you can simultaneously write and read over the same socket.
- Multiwriter: over the same [node](basic_concepts.md), you can write simultaneously from any number of different sockets without blocking one another.
- One internal thread with non-blocking sockets: creating new connections and listeners does not add new threads. All sockets are managed from one thread, saving resources as the application scales.
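As an illustration of the zero-copy path, here is a minimal sketch based on the API exercised by `benches/latency.rs` in this commit (UDP is used for brevity and error handling is simplified; the benchmark file above shows the full, working setup):

```rust
use message_io::network::{self, Transport};
use std::time::Duration;

fn main() {
    // One controller to manage the sockets, one processor to drive the internal poll.
    let (controller, mut processor) = network::split();

    // Listener and connection over localhost: no extra threads are created.
    let (_, addr) = controller.listen(Transport::Udp, "127.0.0.1:0").unwrap();
    let (endpoint, _) = controller.connect(Transport::Udp, addr).unwrap();

    // The `&[u8]` reference is handed down to the OS socket:
    // the library itself makes no copy of the payload.
    controller.send(endpoint, b"hello");

    // Dispatch pending network events from the single internal poll.
    processor.process_poll_event(Some(Duration::from_millis(100)), |_event| ());
}
```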
## Benchmarks

The benchmarks compare two dimensions:
- *Vertical*: among the different transports, how they compare to one another.
- *Horizontal*: between a *native usage* and the same usage through `message-io`. A *native usage* is represented here as the **most basic usage** of a transport in a blocking way between two sockets, with zero overhead (see the sketch below).

When checking the results, take into account that `message-io` manages a pool of sockets for you, behaving in a non-blocking way and waking from an OS poll, which adds some delay. However, in most network applications you would need to build some kind of socket management on top of the *native usage* yourself, so in practice, not using `message-io` also implies the overhead of implementing that management layer.

*Note that the network benchmark results can vary slightly among different runs, since they depend considerably on the OS load at the moment of execution.*
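To make the *horizontal* comparison concrete, the *native usage* side for UDP is nothing more than a pair of blocking `std::net` sockets, as in this simplified sketch (adapted from `latency_by_native_udp` in `benches/latency.rs`):

```rust
use std::net::UdpSocket;

fn main() -> std::io::Result<()> {
    // The most basic blocking usage of the transport: two UDP sockets over
    // localhost, with no management layer at all. This is the baseline that
    // `message-io` is measured against.
    let receiver = UdpSocket::bind("127.0.0.1:0")?;
    let sender = UdpSocket::bind("127.0.0.1:0")?;
    sender.connect(receiver.local_addr()?)?;

    sender.send(&[0xFF])?; // send a single byte...
    let mut buffer = [0u8; 1];
    receiver.recv(&mut buffer)?; // ...and read it directly from the peer socket
    Ok(())
}
```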
### Throughput benchmarks

Throughput is the amount of data you can transfer per second, measured in bytes per second. The measurement starts when the sender sends the first byte and ends when the receiver receives the last byte.

<p align="">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vRbPf-P6iKnLV8xStrJq5jIiIl7slzRPRMUOf9WbrPgpa5FeBq6N-qSJkx46T41LzppUmVBIemT1pS3/pub?w=697&h=573"/>
</p>

The following results are measured for the transmission of 1GB of data over localhost:

| Transport  | native   | message-io | efficiency |
|:----------:|:--------:|:----------:|:----------:|
| UDP        | 7.1 GB/s | 5.9 GB/s   | ~83%       |
| TCP        | 6.4 GB/s | 5.4 GB/s   | ~84%       |
| Framed TCP | 5.5 GB/s | 5.0 GB/s   | ~91%       |
| Web Socket | 590 MB/s | 560 MB/s   | ~95%       |
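Here *efficiency* is the throughput ratio between `message-io` and the native usage; for UDP, for instance: 5.9 GB/s ÷ 7.1 GB/s ≈ 0.83, i.e. ~83%.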
Take into account that the throughput is measured over localhost. On localhost the computer does little more than a data copy, so the measurement is not hidden by network latency. This means that the time difference between the *native usage* and `message-io` measures exactly the overhead that `message-io` adds over that *native usage*. Even under these conditions, unfavorable for `message-io`, it maintains an efficiency of `80%`-`95%`, which in practice (in a real network environment with some latency) would be even higher.

As mentioned above, note that the *native usage* is a raw and basic usage of the sockets. Any real application will add some kind of management over them, which also adds overhead.

If you want to test it yourself, see the [throughput](../examples/throughput) example.
### Latency benchmarks

Latency can be measured as the time a byte takes to be transferred. The measurement starts when the sender sends a byte and ends when the receiver receives that byte.

<p align="">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vQJ9bhjVWzSnNQLg75Uaed5ZYUIRoJ03OqEg_HSS8VxZqMlvUUFG6ki_mgDc_MDFKrUmbKb8S3eGHvJ/pub?w=692&h=301"/>
</p>

The following results are measured by transferring 1 byte over localhost:

| Transport  | native | message-io | overhead  |
|:----------:|:------:|:----------:|:---------:|
| UDP        | 1.2 us | 2.1 us     | + ~0.9 us |
| TCP        | 2.6 us | 3.5 us     | + ~0.9 us |
| Framed TCP | 5.2 us | 6.6 us     | + ~1.4 us |
| Web Socket | 9.1 us | 11.2 us    | + ~2.1 us |
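The *overhead* column is simply the difference between the two latency columns; for UDP, for instance: 2.1 us − 1.2 us = 0.9 us of overhead per message.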
Depending on the transport used, `message-io` adds around `1-2us` of overhead per chunk of data transmitted. Because reading/writing messages is zero-copy, this overhead is constant and independent of the size of that chunk: the library only copies the pointer to the data. This means, for example, that sending and receiving a message of 1 byte or of 64KB over *UDP* (if the MTU supports that size) adds only around `0.9us` of overhead to the entire data transmission.

*Note that TCP-based protocols could internally split your message into several chunks of data.*

If you want to test it yourself, execute `cargo bench`.
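The benchmarked transports are gated behind Cargo features (`udp`, `tcp`, `websocket`, matching the `#[cfg]` attributes in `benches/latency.rs`); if one of them is not enabled by default in your build, the standard Cargo flag should include it, e.g.:

```sh
cargo bench --features websocket
```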

examples/throughput/README.md

Lines changed: 23 additions & 0 deletions
# Throughput example

This example shows the throughput of the different transports in two ways: used through `message-io`, and used natively, isolated from the library.

The aim is to compare two dimensions: the different protocols among themselves, and the overhead `message-io` adds.

To run this test, run:
```sh
cargo run --release --example throughput
```

**DO NOT FORGET** to run this example with the `--release` flag to get the real measurements.

The throughput is measured by sending *1GB* of data between two connected endpoints over **localhost**. The measurement starts when the sender starts sending the data and finishes when the receiver receives the entire data.
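From those two instants, the throughput is just the amount of data divided by the elapsed time; for example, transferring 1 GB in about 0.17 s corresponds to 1 GB / 0.17 s ≈ 5.9 GB/s, the order of the figures reported in the [performance](../../docs/performance_benchmarks.md) tables.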
<p align="center">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vRbPf-P6iKnLV8xStrJq5jIiIl7slzRPRMUOf9WbrPgpa5FeBq6N-qSJkx46T41LzppUmVBIemT1pS3/pub?w=697&h=573"/>
</p>

To know more about `message-io` performance and how to interpret the results, see the [performance](../../docs/performance_benchmarks.md) document.
