
Commit 80ae8e9

Added throughput benchmark example (#72)
* Windows fix
* Added throughput benchmark example
* Added throughput documentation
* Added native latencies
* Performance latency test with websocket conditional compilation
* Added latency documentation
* Updated performance docs
1 parent 0bafb31 commit 80ae8e9

File tree

7 files changed: +559 −69 lines changed


Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -46,5 +46,5 @@ httparse = "1.3.5"
 doc-comment = "0.3"

 [[bench]]
-name = "performance"
+name = "latency"
 harness = false

README.md

Lines changed: 2 additions & 4 deletions
@@ -55,12 +55,10 @@ You could change the transport of your application in literally one line.
   "One thread to rule them all".
 - Easy error handling:
   do not deal with dark internal `std::io::Error` when send/receive from the network.
-- High performance:
-  - Non-blocking sockets: scale the application without wasting memory and time synchonizing
-    multiple threads.
+- High performance (see the [benchmarks](docs/performance_benchmarks.md)):
   - Write/read messages with zero-copy.
     You write and read directly from the internal OS socket buffer without any copy in the middle by the library.
-  - Full duplex: simultaneous reading/writing operations over same internal OS sockets.
+  - Full duplex: simultaneous reading/writing operations over same internal OS socket.
 - Customizable: `message-io` doesn't have the transport you need?
   Add easily and [adapter](#custom-adapter).

benches/latency.rs

Lines changed: 168 additions & 0 deletions
use message_io::network::{self, Transport, NetworkController, NetworkProcessor, Endpoint};
use message_io::util::thread::{NamespacedThread};
use message_io::util::encoding::{self, Decoder, MAX_ENCODED_SIZE};

use criterion::{criterion_group, criterion_main, Criterion};

#[cfg(feature = "websocket")]
use tungstenite::{Message, connect as ws_connect, accept as ws_accept};
use url::{Url};

use std::net::{TcpListener, TcpStream, UdpSocket};
use std::time::{Duration};
use std::sync::{
    Arc,
    atomic::{AtomicBool, Ordering},
};
use std::io::{Write, Read};

lazy_static::lazy_static! {
    static ref TIMEOUT: Duration = Duration::from_millis(100);
}

fn init_connection(transport: Transport) -> (NetworkController, NetworkProcessor, Endpoint) {
    let (controller, mut processor) = network::split();

    let running = Arc::new(AtomicBool::new(true));
    let mut thread = {
        let running = running.clone();
        NamespacedThread::spawn("perf-listening", move || {
            while running.load(Ordering::Relaxed) {
                processor.process_poll_event(Some(*TIMEOUT), |_| ());
            }
            processor
        })
    };

    let receiver_addr = controller.listen(transport, "127.0.0.1:0").unwrap().1;
    let receiver = controller.connect(transport, receiver_addr).unwrap().0;

    running.store(false, Ordering::Relaxed);
    let processor = thread.join();
    // From here, the connection is performed independently of the transport used

    (controller, processor, receiver)
}

fn latency_by(c: &mut Criterion, transport: Transport) {
    let msg = format!("latency by {}", transport);
    c.bench_function(&msg, |b| {
        let (controller, mut processor, endpoint) = init_connection(transport);

        b.iter(|| {
            controller.send(endpoint, &[0xFF]);
            processor.process_poll_event(Some(*TIMEOUT), |_| ());
        });
    });
}

fn latency_by_native_udp(c: &mut Criterion) {
    let msg = format!("latency by native Udp");
    c.bench_function(&msg, |b| {
        let receiver = UdpSocket::bind("127.0.0.1:0").unwrap();
        let addr = receiver.local_addr().unwrap();

        let sender = UdpSocket::bind("127.0.0.1:0").unwrap();
        sender.connect(addr).unwrap();

        let mut buffer: [u8; 1] = [0; 1];

        b.iter(|| {
            sender.send(&[0xFF]).unwrap();
            receiver.recv(&mut buffer).unwrap();
        });
    });
}

fn latency_by_native_tcp(c: &mut Criterion) {
    let msg = format!("latency by native Tcp");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut sender = TcpStream::connect(addr).unwrap();
        let (mut receiver, _) = listener.accept().unwrap();

        let mut buffer: [u8; 1] = [0; 1];

        b.iter(|| {
            sender.write(&[0xFF]).unwrap();
            receiver.read(&mut buffer).unwrap();
        });
    });
}

fn latency_by_native_framed_tcp(c: &mut Criterion) {
    let msg = format!("latency by native FramedTcp");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut sender = TcpStream::connect(addr).unwrap();
        let (mut receiver, _) = listener.accept().unwrap();

        let mut buffer: [u8; 1] = [0; 1];
        let mut framming = [0; MAX_ENCODED_SIZE]; // used to avoid a heap allocation
        let mut decoder = Decoder::default();

        b.iter(|| {
            let encoded_size = encoding::encode_size(&[0xFF], &mut framming);
            sender.write(&encoded_size).unwrap();
            sender.write(&[0xFF]).unwrap();

            let mut message_received = false;
            while !message_received {
                let bytes = receiver.read(&mut buffer).unwrap();
                decoder.decode(&buffer[0..bytes], |_decoded_data| message_received = true);
            }
        });
    });
}

#[cfg(feature = "websocket")]
fn latency_by_native_web_socket(c: &mut Criterion) {
    let msg = format!("latency by native Ws");
    c.bench_function(&msg, |b| {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = listener.local_addr().unwrap();

        let mut listen_thread = NamespacedThread::spawn("perf-listening", move || {
            ws_accept(listener.accept().unwrap().0).unwrap()
        });

        let url_addr = format!("ws://{}/socket", addr);
        let (mut sender, _) = ws_connect(Url::parse(&url_addr).unwrap()).unwrap();

        let mut receiver = listen_thread.join();

        let message = vec![0xFF];

        b.iter(|| {
            sender.write_message(Message::Binary(message.clone())).unwrap();
            receiver.read_message().unwrap();
        });
    });
}

fn latency(c: &mut Criterion) {
    #[cfg(feature = "udp")]
    latency_by(c, Transport::Udp);
    #[cfg(feature = "tcp")]
    latency_by(c, Transport::Tcp);
    #[cfg(feature = "tcp")]
    latency_by(c, Transport::FramedTcp);
    #[cfg(feature = "websocket")]
    latency_by(c, Transport::Ws);

    #[cfg(feature = "udp")]
    latency_by_native_udp(c);
    #[cfg(feature = "tcp")]
    latency_by_native_tcp(c);
    #[cfg(feature = "tcp")]
    latency_by_native_framed_tcp(c);
    #[cfg(feature = "websocket")]
    latency_by_native_web_socket(c);
}

criterion_group!(benches, latency);
criterion_main!(benches);

benches/performance.rs

Lines changed: 0 additions & 64 deletions
This file was deleted.

docs/performance_benchmarks.md

Lines changed: 84 additions & 0 deletions
# Performance

`message-io` is focused on getting the best performance, adding the minimum possible overhead over the raw OS sockets. As performance features, it offers:

- Zero-copy read/write of messages: messages sent and received pass their data reference (`&[u8]`) through the library, from user space to the OS socket and vice versa, without copies. This means that the overhead the library adds is independent of the size of the message, because the library only copies the pointer to the data, no matter how long that data is (a minimal sketch follows this list).
- Full-duplex: from two different threads, you can simultaneously write and read over the same socket.
- Multiwriter: over the same [node](basic_concepts.md), you can write simultaneously from any number of different sockets without blocking one another.
- One internal thread with non-blocking sockets: creating new connections and listeners does not add new threads. All sockets are managed from one thread, saving resources as the application scales.
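As an illustration of the zero-copy path, here is a minimal sketch based on the API exercised by `benches/latency.rs` in this commit (UDP is used for brevity and error handling is simplified; the benchmark file above shows the full, working setup):

```rust
use message_io::network::{self, Transport};
use std::time::Duration;

fn main() {
    // One controller to manage the sockets, one processor to drive the internal poll.
    let (controller, mut processor) = network::split();

    // Listener and connection over localhost: no extra threads are created.
    let (_, addr) = controller.listen(Transport::Udp, "127.0.0.1:0").unwrap();
    let (endpoint, _) = controller.connect(Transport::Udp, addr).unwrap();

    // The `&[u8]` reference is handed down to the OS socket:
    // the library itself makes no copy of the payload.
    controller.send(endpoint, b"hello");

    // Dispatch pending network events from the single internal poll.
    processor.process_poll_event(Some(Duration::from_millis(100)), |_event| ());
}
```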
## Benchmarks

The benchmarks compare two dimensions:
- *Vertical*: among the different transports, how they compare to one another.
- *Horizontal*: between a *native usage* and the same usage through `message-io`. A *native usage* is represented here as the **most basic usage** of a transport in a blocking way between two sockets, with zero overhead (see the sketch below).

When checking the results, take into account that `message-io` manages a pool of sockets for you, behaving in a non-blocking way and waking from an OS poll, which adds some delay. However, in most network applications you would need to build some kind of socket management on top of the *native usage* yourself, so in practice, not using `message-io` also implies the overhead of implementing that management layer.

*Note that the network benchmark results can vary slightly among different runs, since they depend considerably on the OS load at the moment of execution.*
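To make the *horizontal* comparison concrete, the *native usage* side for UDP is nothing more than a pair of blocking `std::net` sockets, as in this simplified sketch (adapted from `latency_by_native_udp` in `benches/latency.rs`):

```rust
use std::net::UdpSocket;

fn main() -> std::io::Result<()> {
    // The most basic blocking usage of the transport: two UDP sockets over
    // localhost, with no management layer at all. This is the baseline that
    // `message-io` is measured against.
    let receiver = UdpSocket::bind("127.0.0.1:0")?;
    let sender = UdpSocket::bind("127.0.0.1:0")?;
    sender.connect(receiver.local_addr()?)?;

    sender.send(&[0xFF])?; // send a single byte...
    let mut buffer = [0u8; 1];
    receiver.recv(&mut buffer)?; // ...and read it directly from the peer socket
    Ok(())
}
```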
### Throughput benchmarks

Throughput is the amount of data you can transfer per second, measured in bytes per second. The measurement starts when the sender sends the first byte and ends when the receiver receives the last byte.

<p align="">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vRbPf-P6iKnLV8xStrJq5jIiIl7slzRPRMUOf9WbrPgpa5FeBq6N-qSJkx46T41LzppUmVBIemT1pS3/pub?w=697&h=573"/>
</p>

The following results are measured for the transmission of 1GB of data over localhost:

| Transport  | native   | message-io | efficiency |
|:----------:|:--------:|:----------:|:----------:|
| UDP        | 7.1 GB/s | 5.9 GB/s   | ~83%       |
| TCP        | 6.4 GB/s | 5.4 GB/s   | ~84%       |
| Framed TCP | 5.5 GB/s | 5.0 GB/s   | ~91%       |
| Web Socket | 590 MB/s | 560 MB/s   | ~95%       |
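Here *efficiency* is the throughput ratio between `message-io` and the native usage; for UDP, for instance: 5.9 GB/s ÷ 7.1 GB/s ≈ 0.83, i.e. ~83%.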
Take into account that the throughput is measured over localhost. On localhost the computer does little more than a data copy, so the measurement is not hidden by network latency. This means that the time difference between the *native usage* and `message-io` measures exactly the overhead that `message-io` adds over that *native usage*. Even under these conditions, unfavorable for `message-io`, it maintains an efficiency of `80%`-`95%`, which in practice (in a real network environment with some latency) would be even higher.

As mentioned above, note that the *native usage* is a raw and basic usage of the sockets. Any real application will add some kind of management over them, which also adds overhead.

If you want to test it yourself, see the [throughput](../examples/throughput) example.
### Latency benchmarks

Latency can be measured as the time a byte takes to be transferred. The measurement starts when the sender sends a byte and ends when the receiver receives that byte.

<p align="">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vQJ9bhjVWzSnNQLg75Uaed5ZYUIRoJ03OqEg_HSS8VxZqMlvUUFG6ki_mgDc_MDFKrUmbKb8S3eGHvJ/pub?w=692&h=301"/>
</p>

The following results are measured by transferring 1 byte over localhost:

| Transport  | native | message-io | overhead  |
|:----------:|:------:|:----------:|:---------:|
| UDP        | 1.2 us | 2.1 us     | + ~0.9 us |
| TCP        | 2.6 us | 3.5 us     | + ~0.9 us |
| Framed TCP | 5.2 us | 6.6 us     | + ~1.4 us |
| Web Socket | 9.1 us | 11.2 us    | + ~2.1 us |
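The *overhead* column is simply the difference between the two latency columns; for UDP, for instance: 2.1 us − 1.2 us = 0.9 us of overhead per message.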
Depending on the transport used, `message-io` adds around `1-2us` of overhead per chunk of data transmitted. Because reading/writing messages is zero-copy, this overhead is constant and independent of the size of that chunk: the library only copies the pointer to the data. This means, for example, that sending and receiving a message of 1 byte or of 64KB over *UDP* (if the MTU supports that size) adds only around `0.9us` of overhead to the entire data transmission.

*Note that TCP-based protocols could internally split your message into several chunks of data.*

If you want to test it yourself, execute `cargo bench`.
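The benchmarked transports are gated behind Cargo features (`udp`, `tcp`, `websocket`, matching the `#[cfg]` attributes in `benches/latency.rs`); if one of them is not enabled by default in your build, the standard Cargo flag should include it, e.g.:

```sh
cargo bench --features websocket
```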

examples/throughput/README.md

Lines changed: 23 additions & 0 deletions
# Throughput example

This example shows the throughput of the different transports in two ways: used through `message-io`, and used natively, isolated from the library.

The aim is to compare two dimensions: the different protocols among themselves, and the overhead `message-io` adds.

To run this test, run:
```sh
cargo run --release --example throughput
```

**DO NOT FORGET** to run this example with the `--release` flag to get the real measurements.

The throughput is measured by sending *1GB* of data between two connected endpoints over **localhost**. The measurement starts when the sender starts sending the data and finishes when the receiver receives the entire data.
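From those two instants, the throughput is just the amount of data divided by the elapsed time; for example, transferring 1 GB in about 0.17 s corresponds to 1 GB / 0.17 s ≈ 5.9 GB/s, the order of the figures reported in the [performance](../../docs/performance_benchmarks.md) tables.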
<p align="center">
  <img src="https://docs.google.com/drawings/d/e/2PACX-1vRbPf-P6iKnLV8xStrJq5jIiIl7slzRPRMUOf9WbrPgpa5FeBq6N-qSJkx46T41LzppUmVBIemT1pS3/pub?w=697&h=573"/>
</p>

To know more about `message-io` performance and how to interpret the results, see the [performance](../../docs/performance_benchmarks.md) document.
