diff --git a/Cargo.lock b/Cargo.lock index b11b7ee..8364a60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -177,9 +177,9 @@ checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", "rand_core", @@ -232,7 +232,7 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "rlnc" -version = "0.8.4" +version = "0.8.5" dependencies = [ "divan", "rand", diff --git a/Cargo.toml b/Cargo.toml index 39d5794..d686e53 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rlnc" -version = "0.8.4" +version = "0.8.5" edition = "2024" resolver = "3" rust-version = "1.89.0" @@ -20,7 +20,7 @@ categories = ["network-programming", "encoding", "algorithms"] [dependencies] -rand = "=0.9.1" +rand = "=0.9.2" rayon = { version = "=1.10.0", optional = true } [dev-dependencies] diff --git a/README.md b/README.md index 7ce39b3..c7d63f0 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,15 @@ Blazing Fast Erasure-Coding with Random Linear Network Coding (RLNC) ## Introduction -`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x^2 + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data, using AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$. +`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data. It performs runtime introspection of platform and uses the best of GFNI, AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$. -Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has AVX512 support. More performance benchmark results [below](#benchmarking). +Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has GFNI + AVX512 support. More performance benchmark results [below](#benchmarking). -![rlnc-encoder-on-x86_64_with-amd-avx512](./plots/rlnc-encoder-on-x86_64_with-amd-avx512.png) +![rlnc-encoder-on-x86_64_with-amd-gfni](./plots/rlnc-encoder-on-x86_64_with-amd-gfni.png) -![rlnc-recoder-on-x86_64_with-amd-avx512](./plots/rlnc-recoder-on-x86_64_with-amd-avx512.png) +![rlnc-recoder-on-x86_64_with-amd-gfni](./plots/rlnc-recoder-on-x86_64_with-amd-gfni.png) -![rlnc-decoder-on-x86_64_with-amd-avx512](./plots/rlnc-decoder-on-x86_64_with-amd-avx512.png) +![rlnc-decoder-on-x86_64_with-amd-gfni](./plots/rlnc-decoder-on-x86_64_with-amd-gfni.png) --- **Let's take a practical example of how RLNC can be useful.** @@ -116,16 +116,20 @@ Coverage Results: || Tested/Total Lines: || src/common/errors.rs: 0/1 || src/common/gf256.rs: 9/11 -|| src/common/simd/mod.rs: 6/9 +|| src/common/simd/mod.rs: 8/12 || src/common/simd/x86/avx2.rs: 10/10 -|| src/common/simd/x86/mod.rs: 6/15 +|| src/common/simd/x86/avx512.rs: 0/10 +|| src/common/simd/x86/gfni/m128i.rs: 0/5 +|| src/common/simd/x86/gfni/m256i.rs: 0/5 +|| src/common/simd/x86/gfni/m512i.rs: 0/5 +|| src/common/simd/x86/mod.rs: 18/33 || src/common/simd/x86/ssse3.rs: 0/10 -|| src/full/decoder.rs: 25/32 +|| src/full/decoder.rs: 26/31 || src/full/decoder_matrix.rs: 51/58 -|| src/full/encoder.rs: 24/27 -|| src/full/recoder.rs: 28/36 +|| src/full/encoder.rs: 25/33 +|| src/full/recoder.rs: 27/39 || -76.08% coverage, 159/209 lines covered +66.16% coverage, 174/263 lines covered ``` This will create an HTML coverage report at `tarpaulin-report.html` that you can open in your web browser to view detailed line-by-line coverage information for all source files. @@ -3781,11 +3785,11 @@ To use `rlnc` library crate in your Rust project, add it as a dependency in your ```toml [dependencies] -rlnc = "=0.8.4" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. +rlnc = "=0.8.5" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. # or -rlnc = { version = "=0.8.4", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding. +rlnc = { version = "=0.8.5", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding. -rand = { version = "=0.9.1" } # Required for random number generation +rand = { version = "=0.9.2" } # Required for random number generation ``` ### Full RLNC Workflow Example @@ -3806,6 +3810,7 @@ See [full_rlnc.rs](./examples/full_rlnc.rs) example program. Run the program wit ```bash Initialized Encoder with 10240 bytes of data, split into 32 pieces, each of 321 bytes. Each coded piece will be of 353 bytes. +Overhead of encoding: 10.31% Initializing Decoder, expecting 32 original pieces of 321 bytes each. Sender generating 16 initial coded pieces... diff --git a/plots/README.md b/plots/README.md index aa94f04..3e350c4 100644 --- a/plots/README.md +++ b/plots/README.md @@ -21,7 +21,7 @@ ```bash # This patch was generated on git tag v0.8.4 of this crate # If the benchmark files are not touched after that, this patch should work. - git apply plots/scripts/visualize-plot-bench-result.patch + git apply plots/visualize-plot-bench-result.patch ``` 4. Run benchmark program and collect console output. @@ -71,3 +71,11 @@ All scripts are inside [scripts](./scripts) directory. ![rlnc-recoder-on-x86_64-with-amd-avx512](./rlnc-recoder-on-x86_64_with-amd-avx512.png) ![rlnc-decoder-on-x86_64-with-amd-avx512](./rlnc-decoder-on-x86_64_with-amd-avx512.png) + +## Performance Benchmarking on AMD x86_64 with GFNI+AV512 + +![rlnc-encoder-on-x86_64-with-amd-gfni](./rlnc-encoder-on-x86_64_with-amd-gfni.png) + +![rlnc-recoder-on-x86_64-with-amd-gfni](./rlnc-recoder-on-x86_64_with-amd-gfni.png) + +![rlnc-decoder-on-x86_64-with-amd-gfni](./rlnc-decoder-on-x86_64_with-amd-gfni.png) diff --git a/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png new file mode 100644 index 0000000..2d31a9b Binary files /dev/null and b/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png differ diff --git a/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png new file mode 100644 index 0000000..2829d52 Binary files /dev/null and b/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png differ diff --git a/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png new file mode 100644 index 0000000..03ddee3 Binary files /dev/null and b/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png differ diff --git a/src/common/gf256.rs b/src/common/gf256.rs index 498f456..e57882a 100644 --- a/src/common/gf256.rs +++ b/src/common/gf256.rs @@ -14,41 +14,41 @@ pub const GF256_BIT_WIDTH: usize = u8::BITS as usize; pub const GF256_HALF_ORDER: usize = 1usize << (GF256_BIT_WIDTH / 2); const GF256_LOG_TABLE: [u8; GF256_ORDER] = [ - 0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47, - 225, 36, 15, 33, 53, 147, 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, 228, 114, 166, 6, 191, 139, 98, 102, 221, - 48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, 195, 72, 126, - 110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103, - 74, 222, 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135, - 151, 178, 220, 252, 190, 97, 242, 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, 123, 164, 118, 196, 23, 73, 236, - 127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, 122, - 117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, 175, + 0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, 3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105, 28, 193, 125, 194, 29, 181, + 249, 185, 39, 106, 77, 228, 166, 114, 154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130, 69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208, + 206, 148, 19, 92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98, 179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182, 30, 66, + 58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155, 159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168, 80, 244, 234, 214, 116, 79, + 174, 233, 213, 231, 230, 173, 232, 44, 215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81, 160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67, + 31, 45, 164, 118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161, 108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188, 149, 207, + 205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71, 20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46, 137, 180, 124, 184, 38, 119, + 153, 227, 165, 103, 74, 237, 222, 197, 49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7, ]; const GF256_EXP_TABLE: [u8; 2 * GF256_ORDER - 2] = [ - 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38, 76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37, - 74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5, 10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94, - 188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187, 107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68, - 136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248, 237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169, - 79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85, 170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229, - 215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98, 196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112, - 224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239, 195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235, - 203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54, 108, 216, 173, 71, 142, 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38, - 76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37, 74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5, - 10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94, 188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187, - 107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68, 136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248, - 237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169, 79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85, - 170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229, 215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98, - 196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112, 224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239, - 195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235, 203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54, - 108, 216, 173, 71, 142, + 1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228, + 55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, + 77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80, + 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194, + 93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130, + 157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119, + 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148, + 167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246, 1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53, + 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4, + 12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, + 107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135, + 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64, + 192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, + 175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168, + 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108, + 180, 199, 82, 246, ]; /// Galois Field GF(2^8) wrapper type. /// /// This type represents elements of the Galois Field GF(2^8), which is commonly used in coding theory, cryptography, and error correction codes. /// It supports basic arithmetic operations such as addition, subtraction, multiplication, and division. -/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1 -/// and the primitive element x = 2. +/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x + 1 +/// and the primitive element x = 3. /// /// We assign the `transparent` attribute to ensure that the Rust compiler representation of `Gf256` is the same as its underlying `u8` value, /// providing a guarantee that it can be used interchangeably with `u8` in contexts where the underlying value is needed. @@ -79,9 +79,9 @@ impl Gf256 { Gf256::new(1) } - /// Returns primitive element x, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1. + /// Returns primitive element x + 1, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x + 1. pub const fn primitive_element() -> Self { - Gf256::new(2) + Gf256::new(3) } /// Compile-time executable multiplication of two bytes, over GF(2^8). diff --git a/src/common/simd/x86/gfni/m128i.rs b/src/common/simd/x86/gfni/m128i.rs new file mode 100644 index 0000000..62283c3 --- /dev/null +++ b/src/common/simd/x86/gfni/m128i.rs @@ -0,0 +1,54 @@ +use crate::common::gf256::{GF256_HALF_ORDER, Gf256}; + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[target_feature(enable = "gfni", enable = "avx512vl")] +pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { + let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm_set1_epi8(scalar as i8); + for chunk in iter.by_ref() { + let chunk_simd = _mm_loadu_si128(chunk.as_ptr().cast()); + let res = _mm_gf2p8mul_epi8(chunk_simd, scalar_simd); + + _mm_storeu_si128(chunk.as_mut_ptr().cast(), res); + } + } + + iter.into_remainder().iter_mut().for_each(|symbol| { + *symbol = Gf256::mul_const(*symbol, scalar); + }); +} + +#[target_feature(enable = "gfni", enable = "avx512vl")] +pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { + let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm_set1_epi8(scalar as i8); + + for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { + let mul_vec_chunk_simd = _mm_loadu_si128(mul_vec_chunk.as_ptr().cast()); + let scaled_res = _mm_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd); + + let add_vec_chunk_simd = _mm_loadu_si128(add_vec_chunk.as_ptr().cast()); + let accum_res = _mm_xor_si128(add_vec_chunk_simd, scaled_res); + + _mm_storeu_si128(add_vec_chunk.as_mut_ptr().cast(), accum_res); + } + } + + add_vec_iter + .into_remainder() + .iter_mut() + .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar))) + .for_each(|(res, scaled)| { + *res ^= scaled; + }); +} diff --git a/src/common/simd/x86/gfni/m256i.rs b/src/common/simd/x86/gfni/m256i.rs new file mode 100644 index 0000000..f101bcd --- /dev/null +++ b/src/common/simd/x86/gfni/m256i.rs @@ -0,0 +1,54 @@ +use crate::common::gf256::{GF256_HALF_ORDER, Gf256}; + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[target_feature(enable = "gfni", enable = "avx512vl")] +pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { + let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm256_set1_epi8(scalar as i8); + for chunk in iter.by_ref() { + let chunk_simd = _mm256_loadu_si256(chunk.as_ptr().cast()); + let res = _mm256_gf2p8mul_epi8(chunk_simd, scalar_simd); + + _mm256_storeu_si256(chunk.as_mut_ptr().cast(), res); + } + } + + iter.into_remainder().iter_mut().for_each(|symbol| { + *symbol = Gf256::mul_const(*symbol, scalar); + }); +} + +#[target_feature(enable = "gfni", enable = "avx512vl")] +pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { + let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm256_set1_epi8(scalar as i8); + + for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { + let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast()); + let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd); + + let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast()); + let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res); + + _mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res); + } + } + + add_vec_iter + .into_remainder() + .iter_mut() + .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar))) + .for_each(|(res, scaled)| { + *res ^= scaled; + }); +} diff --git a/src/common/simd/x86/gfni/m512i.rs b/src/common/simd/x86/gfni/m512i.rs new file mode 100644 index 0000000..6b9049f --- /dev/null +++ b/src/common/simd/x86/gfni/m512i.rs @@ -0,0 +1,54 @@ +use crate::common::gf256::{GF256_HALF_ORDER, Gf256}; + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[target_feature(enable = "gfni", enable = "avx512f")] +pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) { + let mut iter = vec.chunks_exact_mut(4 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm512_set1_epi8(scalar as i8); + for chunk in iter.by_ref() { + let chunk_simd = _mm512_loadu_si512(chunk.as_ptr().cast()); + let res = _mm512_gf2p8mul_epi8(chunk_simd, scalar_simd); + + _mm512_storeu_si512(chunk.as_mut_ptr().cast(), res); + } + } + + iter.into_remainder().iter_mut().for_each(|symbol| { + *symbol = Gf256::mul_const(*symbol, scalar); + }); +} + +#[target_feature(enable = "gfni", enable = "avx512f")] +pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) { + let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * GF256_HALF_ORDER); + let mut mul_vec_iter = mul_vec.chunks_exact(4 * GF256_HALF_ORDER); + + unsafe { + let scalar_simd = _mm512_set1_epi8(scalar as i8); + + for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) { + let mul_vec_chunk_simd = _mm512_loadu_si512(mul_vec_chunk.as_ptr().cast()); + let scaled_res = _mm512_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd); + + let add_vec_chunk_simd = _mm512_loadu_si512(add_vec_chunk.as_ptr().cast()); + let accum_res = _mm512_xor_si512(add_vec_chunk_simd, scaled_res); + + _mm512_storeu_si512(add_vec_chunk.as_mut_ptr().cast(), accum_res); + } + } + + add_vec_iter + .into_remainder() + .iter_mut() + .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar))) + .for_each(|(res, scaled)| { + *res ^= scaled; + }); +} diff --git a/src/common/simd/x86/gfni/mod.rs b/src/common/simd/x86/gfni/mod.rs new file mode 100644 index 0000000..c988b66 --- /dev/null +++ b/src/common/simd/x86/gfni/mod.rs @@ -0,0 +1,3 @@ +pub mod m128i; +pub mod m256i; +pub mod m512i; diff --git a/src/common/simd/x86/mod.rs b/src/common/simd/x86/mod.rs index ba8a66d..e3b24a6 100644 --- a/src/common/simd/x86/mod.rs +++ b/src/common/simd/x86/mod.rs @@ -1,8 +1,24 @@ mod avx2; mod avx512; +mod gfni; mod ssse3; pub(super) fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) -> bool { + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512f") { + unsafe { gfni::m512i::mul_vec_by_scalar(vec, scalar) }; + return true; + } + + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") { + unsafe { gfni::m256i::mul_vec_by_scalar(vec, scalar) }; + return true; + } + + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") { + unsafe { gfni::m128i::mul_vec_by_scalar(vec, scalar) }; + return true; + } + if is_x86_feature_detected!("avx512bw") { unsafe { avx512::mul_vec_by_scalar(vec, scalar) }; return true; @@ -41,6 +57,21 @@ pub(super) fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) -> b } pub(super) fn gf256_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) -> bool { + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512f") { + unsafe { gfni::m512i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) }; + return true; + } + + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") { + unsafe { gfni::m256i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) }; + return true; + } + + if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") { + unsafe { gfni::m128i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) }; + return true; + } + if is_x86_feature_detected!("avx512bw") { unsafe { avx512::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) }; return true; diff --git a/src/full/encoder.rs b/src/full/encoder.rs index e1bc9ee..e5f92bd 100644 --- a/src/full/encoder.rs +++ b/src/full/encoder.rs @@ -243,10 +243,10 @@ impl Encoder { return Err(RLNCError::InvalidOutputBuffer); } - let (coding_vector, mut coded_data) = full_coded_piece.split_at_mut(self.piece_count); + let (coding_vector, coded_data) = full_coded_piece.split_at_mut(self.piece_count); rng.fill_bytes(coding_vector); - self.code_with_coding_vector(&coding_vector, &mut coded_data) + self.code_with_coding_vector(coding_vector, coded_data) } /// Produces a new coded piece, random sampling a coding vector. diff --git a/src/full/recoder.rs b/src/full/recoder.rs index 8702261..d00d544 100644 --- a/src/full/recoder.rs +++ b/src/full/recoder.rs @@ -124,7 +124,7 @@ impl Recoder { return Err(RLNCError::InvalidOutputBuffer); } - let (computed_coding_vector, mut recoded_data) = full_recoded_piece.split_at_mut(self.num_pieces_coded_together); + let (computed_coding_vector, recoded_data) = full_recoded_piece.split_at_mut(self.num_pieces_coded_together); // Compute the resulting coding vector for the original source pieces by multiplying // the random sampled recoding vector by the matrix of received coding vectors. @@ -145,7 +145,7 @@ impl Recoder { unsafe { self.encoder - .code_with_coding_vector(&self.random_recoding_vector, &mut recoded_data) + .code_with_coding_vector(&self.random_recoding_vector, recoded_data) .unwrap_unchecked() }; diff --git a/src/lib.rs b/src/lib.rs index 643b4d7..06d653e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,9 +115,9 @@ //! //! ```toml //! [dependencies] -//! rlnc = "=0.8.4" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. +//! rlnc = "=0.8.5" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics. //! # or -//! rlnc = { version = "=0.8.4", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding. Decoding is not yet parallelized. +//! rlnc = { version = "=0.8.5", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding. Decoding is not yet parallelized. //! //! rand = { version = "=0.9.1" } # Required for random number generation //! ```