Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rlnc"
version = "0.8.4"
version = "0.8.5"
edition = "2024"
resolver = "3"
rust-version = "1.89.0"
Expand All @@ -20,7 +20,7 @@ categories = ["network-programming", "encoding", "algorithms"]


[dependencies]
rand = "=0.9.1"
rand = "=0.9.2"
rayon = { version = "=1.10.0", optional = true }

[dev-dependencies]
Expand Down
33 changes: 19 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ Blazing Fast Erasure-Coding with Random Linear Network Coding (RLNC)

## Introduction

`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x^2 + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data, using AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$.
`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data. It performs runtime introspection of platform and uses the best of GFNI, AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$.

Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has AVX512 support. More performance benchmark results [below](#benchmarking).
Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has GFNI + AVX512 support. More performance benchmark results [below](#benchmarking).

![rlnc-encoder-on-x86_64_with-amd-avx512](./plots/rlnc-encoder-on-x86_64_with-amd-avx512.png)
![rlnc-encoder-on-x86_64_with-amd-gfni](./plots/rlnc-encoder-on-x86_64_with-amd-gfni.png)

![rlnc-recoder-on-x86_64_with-amd-avx512](./plots/rlnc-recoder-on-x86_64_with-amd-avx512.png)
![rlnc-recoder-on-x86_64_with-amd-gfni](./plots/rlnc-recoder-on-x86_64_with-amd-gfni.png)

![rlnc-decoder-on-x86_64_with-amd-avx512](./plots/rlnc-decoder-on-x86_64_with-amd-avx512.png)
![rlnc-decoder-on-x86_64_with-amd-gfni](./plots/rlnc-decoder-on-x86_64_with-amd-gfni.png)

---
**Let's take a practical example of how RLNC can be useful.**
Expand Down Expand Up @@ -116,16 +116,20 @@ Coverage Results:
|| Tested/Total Lines:
|| src/common/errors.rs: 0/1
|| src/common/gf256.rs: 9/11
|| src/common/simd/mod.rs: 6/9
|| src/common/simd/mod.rs: 8/12
|| src/common/simd/x86/avx2.rs: 10/10
|| src/common/simd/x86/mod.rs: 6/15
|| src/common/simd/x86/avx512.rs: 0/10
|| src/common/simd/x86/gfni/m128i.rs: 0/5
|| src/common/simd/x86/gfni/m256i.rs: 0/5
|| src/common/simd/x86/gfni/m512i.rs: 0/5
|| src/common/simd/x86/mod.rs: 18/33
|| src/common/simd/x86/ssse3.rs: 0/10
|| src/full/decoder.rs: 25/32
|| src/full/decoder.rs: 26/31
|| src/full/decoder_matrix.rs: 51/58
|| src/full/encoder.rs: 24/27
|| src/full/recoder.rs: 28/36
|| src/full/encoder.rs: 25/33
|| src/full/recoder.rs: 27/39
||
76.08% coverage, 159/209 lines covered
66.16% coverage, 174/263 lines covered
```

This will create an HTML coverage report at `tarpaulin-report.html` that you can open in your web browser to view detailed line-by-line coverage information for all source files.
Expand Down Expand Up @@ -3781,11 +3785,11 @@ To use `rlnc` library crate in your Rust project, add it as a dependency in your

```toml
[dependencies]
rlnc = "=0.8.4" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
rlnc = "=0.8.5" # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
# or
rlnc = { version = "=0.8.4", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding.
rlnc = { version = "=0.8.5", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding.

rand = { version = "=0.9.1" } # Required for random number generation
rand = { version = "=0.9.2" } # Required for random number generation
```

### Full RLNC Workflow Example
Expand All @@ -3806,6 +3810,7 @@ See [full_rlnc.rs](./examples/full_rlnc.rs) example program. Run the program wit

```bash
Initialized Encoder with 10240 bytes of data, split into 32 pieces, each of 321 bytes. Each coded piece will be of 353 bytes.
Overhead of encoding: 10.31%
Initializing Decoder, expecting 32 original pieces of 321 bytes each.

Sender generating 16 initial coded pieces...
Expand Down
10 changes: 9 additions & 1 deletion plots/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
```bash
# This patch was generated on git tag v0.8.4 of this crate
# If the benchmark files are not touched after that, this patch should work.
git apply plots/scripts/visualize-plot-bench-result.patch
git apply plots/visualize-plot-bench-result.patch
```

4. Run benchmark program and collect console output.
Expand Down Expand Up @@ -71,3 +71,11 @@ All scripts are inside [scripts](./scripts) directory.
![rlnc-recoder-on-x86_64-with-amd-avx512](./rlnc-recoder-on-x86_64_with-amd-avx512.png)

![rlnc-decoder-on-x86_64-with-amd-avx512](./rlnc-decoder-on-x86_64_with-amd-avx512.png)

## Performance Benchmarking on AMD x86_64 with GFNI+AV512

![rlnc-encoder-on-x86_64-with-amd-gfni](./rlnc-encoder-on-x86_64_with-amd-gfni.png)

![rlnc-recoder-on-x86_64-with-amd-gfni](./rlnc-recoder-on-x86_64_with-amd-gfni.png)

![rlnc-decoder-on-x86_64-with-amd-gfni](./rlnc-decoder-on-x86_64_with-amd-gfni.png)
Binary file added plots/rlnc-decoder-on-x86_64_with-amd-gfni.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added plots/rlnc-encoder-on-x86_64_with-amd-gfni.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added plots/rlnc-recoder-on-x86_64_with-amd-gfni.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
56 changes: 28 additions & 28 deletions src/common/gf256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,41 @@ pub const GF256_BIT_WIDTH: usize = u8::BITS as usize;
pub const GF256_HALF_ORDER: usize = 1usize << (GF256_BIT_WIDTH / 2);

const GF256_LOG_TABLE: [u8; GF256_ORDER] = [
0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47,
225, 36, 15, 33, 53, 147, 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, 228, 114, 166, 6, 191, 139, 98, 102, 221,
48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, 195, 72, 126,
110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103,
74, 222, 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135,
151, 178, 220, 252, 190, 97, 242, 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, 123, 164, 118, 196, 23, 73, 236,
127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, 122,
117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, 175,
0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, 3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105, 28, 193, 125, 194, 29, 181,
249, 185, 39, 106, 77, 228, 166, 114, 154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130, 69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208,
206, 148, 19, 92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98, 179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182, 30, 66,
58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155, 159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168, 80, 244, 234, 214, 116, 79,
174, 233, 213, 231, 230, 173, 232, 44, 215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81, 160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67,
31, 45, 164, 118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161, 108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188, 149, 207,
205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71, 20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46, 137, 180, 124, 184, 38, 119,
153, 227, 165, 103, 74, 237, 222, 197, 49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7,
];

const GF256_EXP_TABLE: [u8; 2 * GF256_ORDER - 2] = [
1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38, 76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37,
74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5, 10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94,
188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187, 107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68,
136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248, 237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169,
79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85, 170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229,
215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98, 196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112,
224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239, 195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235,
203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54, 108, 216, 173, 71, 142, 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38,
76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37, 74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5,
10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94, 188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187,
107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68, 136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248,
237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169, 79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85,
170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229, 215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98,
196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112, 224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239,
195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235, 203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54,
108, 216, 173, 71, 142,
1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228,
55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59,
77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80,
240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194,
93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130,
157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119,
153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148,
167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246, 1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53,
95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4,
12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208,
107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135,
146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64,
192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101,
175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168,
227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108,
180, 199, 82, 246,
];

/// Galois Field GF(2^8) wrapper type.
///
/// This type represents elements of the Galois Field GF(2^8), which is commonly used in coding theory, cryptography, and error correction codes.
/// It supports basic arithmetic operations such as addition, subtraction, multiplication, and division.
/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1
/// and the primitive element x = 2.
/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x + 1
/// and the primitive element x = 3.
///
/// We assign the `transparent` attribute to ensure that the Rust compiler representation of `Gf256` is the same as its underlying `u8` value,
/// providing a guarantee that it can be used interchangeably with `u8` in contexts where the underlying value is needed.
Expand Down Expand Up @@ -79,9 +79,9 @@ impl Gf256 {
Gf256::new(1)
}

/// Returns primitive element x, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1.
/// Returns primitive element x + 1, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x + 1.
pub const fn primitive_element() -> Self {
Gf256::new(2)
Gf256::new(3)
}

/// Compile-time executable multiplication of two bytes, over GF(2^8).
Expand Down
54 changes: 54 additions & 0 deletions src/common/simd/x86/gfni/m128i.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use crate::common::gf256::{GF256_HALF_ORDER, Gf256};

#[cfg(target_arch = "x86")]
use std::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[target_feature(enable = "gfni", enable = "avx512vl")]
pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER);

unsafe {
let scalar_simd = _mm_set1_epi8(scalar as i8);
for chunk in iter.by_ref() {
let chunk_simd = _mm_loadu_si128(chunk.as_ptr().cast());
let res = _mm_gf2p8mul_epi8(chunk_simd, scalar_simd);

_mm_storeu_si128(chunk.as_mut_ptr().cast(), res);
}
}

iter.into_remainder().iter_mut().for_each(|symbol| {
*symbol = Gf256::mul_const(*symbol, scalar);
});
}

#[target_feature(enable = "gfni", enable = "avx512vl")]
pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);

unsafe {
let scalar_simd = _mm_set1_epi8(scalar as i8);

for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
let mul_vec_chunk_simd = _mm_loadu_si128(mul_vec_chunk.as_ptr().cast());
let scaled_res = _mm_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);

let add_vec_chunk_simd = _mm_loadu_si128(add_vec_chunk.as_ptr().cast());
let accum_res = _mm_xor_si128(add_vec_chunk_simd, scaled_res);

_mm_storeu_si128(add_vec_chunk.as_mut_ptr().cast(), accum_res);
}
}

add_vec_iter
.into_remainder()
.iter_mut()
.zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar)))
.for_each(|(res, scaled)| {
*res ^= scaled;
});
}
54 changes: 54 additions & 0 deletions src/common/simd/x86/gfni/m256i.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use crate::common::gf256::{GF256_HALF_ORDER, Gf256};

#[cfg(target_arch = "x86")]
use std::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[target_feature(enable = "gfni", enable = "avx512vl")]
pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER);

unsafe {
let scalar_simd = _mm256_set1_epi8(scalar as i8);
for chunk in iter.by_ref() {
let chunk_simd = _mm256_loadu_si256(chunk.as_ptr().cast());
let res = _mm256_gf2p8mul_epi8(chunk_simd, scalar_simd);

_mm256_storeu_si256(chunk.as_mut_ptr().cast(), res);
}
}

iter.into_remainder().iter_mut().for_each(|symbol| {
*symbol = Gf256::mul_const(*symbol, scalar);
});
}

#[target_feature(enable = "gfni", enable = "avx512vl")]
pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);

unsafe {
let scalar_simd = _mm256_set1_epi8(scalar as i8);

for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast());
let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);

let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast());
let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res);

_mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res);
}
}

add_vec_iter
.into_remainder()
.iter_mut()
.zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar)))
.for_each(|(res, scaled)| {
*res ^= scaled;
});
}
Loading
Loading