diff --git a/Cargo.lock b/Cargo.lock
index b11b7ee..8364a60 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -177,9 +177,9 @@ checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
 
 [[package]]
 name = "rand"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
 dependencies = [
  "rand_chacha",
  "rand_core",
@@ -232,7 +232,7 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
 
 [[package]]
 name = "rlnc"
-version = "0.8.4"
+version = "0.8.5"
 dependencies = [
  "divan",
  "rand",
diff --git a/Cargo.toml b/Cargo.toml
index 39d5794..d686e53 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rlnc"
-version = "0.8.4"
+version = "0.8.5"
 edition = "2024"
 resolver = "3"
 rust-version = "1.89.0"
@@ -20,7 +20,7 @@ categories = ["network-programming", "encoding", "algorithms"]
 
 
 [dependencies]
-rand = "=0.9.1"
+rand = "=0.9.2"
 rayon = { version = "=1.10.0", optional = true }
 
 [dev-dependencies]
diff --git a/README.md b/README.md
index 7ce39b3..c7d63f0 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,15 @@ Blazing Fast Erasure-Coding with Random Linear Network Coding (RLNC)
 
 ## Introduction
 
-`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x^2 + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data, using AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$.
+`rlnc` is a Rust library crate that implements an advanced erasure-coding technique Random Linear Network Coding (RLNC) over galois field $GF(2^8)$ with irreducible polynomial $x^8 + x^4 + x^3 + x + 1$. This library provides functionalities for blazing fast erasure-coding of data, reconstructing original data from coded pieces, and recoding existing coded pieces to new erasure-coded pieces, without ever decoding it back to original data. It performs runtime introspection of platform and uses the best of GFNI, AVX512, AVX2 and SSSE3 intrinsics on `x86_64` and NEON intrinsics on `arm64`, for fast vector multiplication by a single scalar over $GF(2^8)$.
 
-Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has AVX512 support. More performance benchmark results [below](#benchmarking).
+Following charts show performance of RLNC encoder, recoder and decoder on **AWS EC2 `m7a.large` with AMD EPYC 9R14** - which has GFNI + AVX512 support. More performance benchmark results [below](#benchmarking).
 
-![rlnc-encoder-on-x86_64_with-amd-avx512](./plots/rlnc-encoder-on-x86_64_with-amd-avx512.png)
+![rlnc-encoder-on-x86_64_with-amd-gfni](./plots/rlnc-encoder-on-x86_64_with-amd-gfni.png)
 
-![rlnc-recoder-on-x86_64_with-amd-avx512](./plots/rlnc-recoder-on-x86_64_with-amd-avx512.png)
+![rlnc-recoder-on-x86_64_with-amd-gfni](./plots/rlnc-recoder-on-x86_64_with-amd-gfni.png)
 
-![rlnc-decoder-on-x86_64_with-amd-avx512](./plots/rlnc-decoder-on-x86_64_with-amd-avx512.png)
+![rlnc-decoder-on-x86_64_with-amd-gfni](./plots/rlnc-decoder-on-x86_64_with-amd-gfni.png)
 
 ---
 **Let's take a practical example of how RLNC can be useful.**
@@ -116,16 +116,20 @@ Coverage Results:
 || Tested/Total Lines:
 || src/common/errors.rs: 0/1
 || src/common/gf256.rs: 9/11
-|| src/common/simd/mod.rs: 6/9
+|| src/common/simd/mod.rs: 8/12
 || src/common/simd/x86/avx2.rs: 10/10
-|| src/common/simd/x86/mod.rs: 6/15
+|| src/common/simd/x86/avx512.rs: 0/10
+|| src/common/simd/x86/gfni/m128i.rs: 0/5
+|| src/common/simd/x86/gfni/m256i.rs: 0/5
+|| src/common/simd/x86/gfni/m512i.rs: 0/5
+|| src/common/simd/x86/mod.rs: 18/33
 || src/common/simd/x86/ssse3.rs: 0/10
-|| src/full/decoder.rs: 25/32
+|| src/full/decoder.rs: 26/31
 || src/full/decoder_matrix.rs: 51/58
-|| src/full/encoder.rs: 24/27
-|| src/full/recoder.rs: 28/36
+|| src/full/encoder.rs: 25/33
+|| src/full/recoder.rs: 27/39
 ||
-76.08% coverage, 159/209 lines covered
+66.16% coverage, 174/263 lines covered
 ```
 
 This will create an HTML coverage report at `tarpaulin-report.html` that you can open in your web browser to view detailed line-by-line coverage information for all source files.
@@ -3781,11 +3785,11 @@ To use `rlnc` library crate in your Rust project, add it as a dependency in your
 
 ```toml
 [dependencies]
-rlnc = "=0.8.4"                                      # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
+rlnc = "=0.8.5"                                      # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
 # or
-rlnc = { version = "=0.8.4", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding.
+rlnc = { version = "=0.8.5", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding and recoding. Note, this feature, doesn't yet parallelize RLNC decoding.
 
-rand = { version = "=0.9.1" } # Required for random number generation
+rand = { version = "=0.9.2" } # Required for random number generation
 ```
 
 ### Full RLNC Workflow Example
@@ -3806,6 +3810,7 @@ See [full_rlnc.rs](./examples/full_rlnc.rs) example program. Run the program wit
 
 ```bash
 Initialized Encoder with 10240 bytes of data, split into 32 pieces, each of 321 bytes. Each coded piece will be of 353 bytes.
+Overhead of encoding: 10.31%
 Initializing Decoder, expecting 32 original pieces of 321 bytes each.
 
 Sender generating 16 initial coded pieces...
diff --git a/plots/README.md b/plots/README.md
index aa94f04..3e350c4 100644
--- a/plots/README.md
+++ b/plots/README.md
@@ -21,7 +21,7 @@
     ```bash
     # This patch was generated on git tag v0.8.4 of this crate
     # If the benchmark files are not touched after that, this patch should work.
-    git apply plots/scripts/visualize-plot-bench-result.patch
+    git apply plots/visualize-plot-bench-result.patch
     ```
 
 4. Run benchmark program and collect console output.
@@ -71,3 +71,11 @@ All scripts are inside [scripts](./scripts) directory.
 ![rlnc-recoder-on-x86_64-with-amd-avx512](./rlnc-recoder-on-x86_64_with-amd-avx512.png)
 
 ![rlnc-decoder-on-x86_64-with-amd-avx512](./rlnc-decoder-on-x86_64_with-amd-avx512.png)
+
+## Performance Benchmarking on AMD x86_64 with GFNI+AV512
+
+![rlnc-encoder-on-x86_64-with-amd-gfni](./rlnc-encoder-on-x86_64_with-amd-gfni.png)
+
+![rlnc-recoder-on-x86_64-with-amd-gfni](./rlnc-recoder-on-x86_64_with-amd-gfni.png)
+
+![rlnc-decoder-on-x86_64-with-amd-gfni](./rlnc-decoder-on-x86_64_with-amd-gfni.png)
diff --git a/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png
new file mode 100644
index 0000000..2d31a9b
Binary files /dev/null and b/plots/rlnc-decoder-on-x86_64_with-amd-gfni.png differ
diff --git a/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png
new file mode 100644
index 0000000..2829d52
Binary files /dev/null and b/plots/rlnc-encoder-on-x86_64_with-amd-gfni.png differ
diff --git a/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png b/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png
new file mode 100644
index 0000000..03ddee3
Binary files /dev/null and b/plots/rlnc-recoder-on-x86_64_with-amd-gfni.png differ
diff --git a/src/common/gf256.rs b/src/common/gf256.rs
index 498f456..e57882a 100644
--- a/src/common/gf256.rs
+++ b/src/common/gf256.rs
@@ -14,41 +14,41 @@ pub const GF256_BIT_WIDTH: usize = u8::BITS as usize;
 pub const GF256_HALF_ORDER: usize = 1usize << (GF256_BIT_WIDTH / 2);
 
 const GF256_LOG_TABLE: [u8; GF256_ORDER] = [
-    0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47,
-    225, 36, 15, 33, 53, 147, 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, 228, 114, 166, 6, 191, 139, 98, 102, 221,
-    48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, 195, 72, 126,
-    110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103,
-    74, 222, 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135,
-    151, 178, 220, 252, 190, 97, 242, 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, 123, 164, 118, 196, 23, 73, 236,
-    127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, 122,
-    117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, 175,
+    0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, 3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105, 28, 193, 125, 194, 29, 181,
+    249, 185, 39, 106, 77, 228, 166, 114, 154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130, 69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208,
+    206, 148, 19, 92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98, 179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182, 30, 66,
+    58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155, 159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168, 80, 244, 234, 214, 116, 79,
+    174, 233, 213, 231, 230, 173, 232, 44, 215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81, 160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67,
+    31, 45, 164, 118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161, 108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188, 149, 207,
+    205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71, 20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46, 137, 180, 124, 184, 38, 119,
+    153, 227, 165, 103, 74, 237, 222, 197, 49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7,
 ];
 
 const GF256_EXP_TABLE: [u8; 2 * GF256_ORDER - 2] = [
-    1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38, 76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37,
-    74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5, 10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94,
-    188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187, 107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68,
-    136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248, 237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169,
-    79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85, 170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229,
-    215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98, 196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112,
-    224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239, 195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235,
-    203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54, 108, 216, 173, 71, 142, 1, 2, 4, 8, 16, 32, 64, 128, 29, 58, 116, 232, 205, 135, 19, 38,
-    76, 152, 45, 90, 180, 117, 234, 201, 143, 3, 6, 12, 24, 48, 96, 192, 157, 39, 78, 156, 37, 74, 148, 53, 106, 212, 181, 119, 238, 193, 159, 35, 70, 140, 5,
-    10, 20, 40, 80, 160, 93, 186, 105, 210, 185, 111, 222, 161, 95, 190, 97, 194, 153, 47, 94, 188, 101, 202, 137, 15, 30, 60, 120, 240, 253, 231, 211, 187,
-    107, 214, 177, 127, 254, 225, 223, 163, 91, 182, 113, 226, 217, 175, 67, 134, 17, 34, 68, 136, 13, 26, 52, 104, 208, 189, 103, 206, 129, 31, 62, 124, 248,
-    237, 199, 147, 59, 118, 236, 197, 151, 51, 102, 204, 133, 23, 46, 92, 184, 109, 218, 169, 79, 158, 33, 66, 132, 21, 42, 84, 168, 77, 154, 41, 82, 164, 85,
-    170, 73, 146, 57, 114, 228, 213, 183, 115, 230, 209, 191, 99, 198, 145, 63, 126, 252, 229, 215, 179, 123, 246, 241, 255, 227, 219, 171, 75, 150, 49, 98,
-    196, 149, 55, 110, 220, 165, 87, 174, 65, 130, 25, 50, 100, 200, 141, 7, 14, 28, 56, 112, 224, 221, 167, 83, 166, 81, 162, 89, 178, 121, 242, 249, 239,
-    195, 155, 43, 86, 172, 69, 138, 9, 18, 36, 72, 144, 61, 122, 244, 245, 247, 243, 251, 235, 203, 139, 11, 22, 44, 88, 176, 125, 250, 233, 207, 131, 27, 54,
-    108, 216, 173, 71, 142,
+    1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228,
+    55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59,
+    77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80,
+    240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194,
+    93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130,
+    157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119,
+    153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148,
+    167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246, 1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, 53,
+    95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34, 102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112, 144, 171, 230, 49, 83, 245, 4,
+    12, 20, 60, 68, 204, 79, 209, 104, 184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98, 166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208,
+    107, 189, 220, 127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16, 48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125, 135,
+    146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22, 58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195, 94, 226, 61, 71, 201, 64,
+    192, 91, 237, 44, 116, 156, 191, 218, 117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223, 122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101,
+    175, 234, 37, 111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27, 45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134, 145, 168,
+    227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123, 141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124, 132, 151, 162, 253, 28, 36, 108,
+    180, 199, 82, 246,
 ];
 
 /// Galois Field GF(2^8) wrapper type.
 ///
 /// This type represents elements of the Galois Field GF(2^8), which is commonly used in coding theory, cryptography, and error correction codes.
 /// It supports basic arithmetic operations such as addition, subtraction, multiplication, and division.
-/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1
-/// and the primitive element x = 2.
+/// The operations are defined over the finite field GF(2^8) with the irreducible polynomial x^8 + x^4 + x^3 + x + 1
+/// and the primitive element x = 3.
 ///
 /// We assign the `transparent` attribute to ensure that the Rust compiler representation of `Gf256` is the same as its underlying `u8` value,
 /// providing a guarantee that it can be used interchangeably with `u8` in contexts where the underlying value is needed.
@@ -79,9 +79,9 @@ impl Gf256 {
         Gf256::new(1)
     }
 
-    /// Returns primitive element x, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x^2 + 1.
+    /// Returns primitive element x + 1, for GF(2^8) field with irreducible polynomial x^8 + x^4 + x^3 + x + 1.
     pub const fn primitive_element() -> Self {
-        Gf256::new(2)
+        Gf256::new(3)
     }
 
     /// Compile-time executable multiplication of two bytes, over GF(2^8).
diff --git a/src/common/simd/x86/gfni/m128i.rs b/src/common/simd/x86/gfni/m128i.rs
new file mode 100644
index 0000000..62283c3
--- /dev/null
+++ b/src/common/simd/x86/gfni/m128i.rs
@@ -0,0 +1,54 @@
+use crate::common::gf256::{GF256_HALF_ORDER, Gf256};
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[target_feature(enable = "gfni", enable = "avx512vl")]
+pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
+    let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm_set1_epi8(scalar as i8);
+        for chunk in iter.by_ref() {
+            let chunk_simd = _mm_loadu_si128(chunk.as_ptr().cast());
+            let res = _mm_gf2p8mul_epi8(chunk_simd, scalar_simd);
+
+            _mm_storeu_si128(chunk.as_mut_ptr().cast(), res);
+        }
+    }
+
+    iter.into_remainder().iter_mut().for_each(|symbol| {
+        *symbol = Gf256::mul_const(*symbol, scalar);
+    });
+}
+
+#[target_feature(enable = "gfni", enable = "avx512vl")]
+pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
+    let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
+    let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm_set1_epi8(scalar as i8);
+
+        for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
+            let mul_vec_chunk_simd = _mm_loadu_si128(mul_vec_chunk.as_ptr().cast());
+            let scaled_res = _mm_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);
+
+            let add_vec_chunk_simd = _mm_loadu_si128(add_vec_chunk.as_ptr().cast());
+            let accum_res = _mm_xor_si128(add_vec_chunk_simd, scaled_res);
+
+            _mm_storeu_si128(add_vec_chunk.as_mut_ptr().cast(), accum_res);
+        }
+    }
+
+    add_vec_iter
+        .into_remainder()
+        .iter_mut()
+        .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar)))
+        .for_each(|(res, scaled)| {
+            *res ^= scaled;
+        });
+}
diff --git a/src/common/simd/x86/gfni/m256i.rs b/src/common/simd/x86/gfni/m256i.rs
new file mode 100644
index 0000000..f101bcd
--- /dev/null
+++ b/src/common/simd/x86/gfni/m256i.rs
@@ -0,0 +1,54 @@
+use crate::common::gf256::{GF256_HALF_ORDER, Gf256};
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[target_feature(enable = "gfni", enable = "avx512vl")]
+pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
+    let mut iter = vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm256_set1_epi8(scalar as i8);
+        for chunk in iter.by_ref() {
+            let chunk_simd = _mm256_loadu_si256(chunk.as_ptr().cast());
+            let res = _mm256_gf2p8mul_epi8(chunk_simd, scalar_simd);
+
+            _mm256_storeu_si256(chunk.as_mut_ptr().cast(), res);
+        }
+    }
+
+    iter.into_remainder().iter_mut().for_each(|symbol| {
+        *symbol = Gf256::mul_const(*symbol, scalar);
+    });
+}
+
+#[target_feature(enable = "gfni", enable = "avx512vl")]
+pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
+    let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
+    let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm256_set1_epi8(scalar as i8);
+
+        for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
+            let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast());
+            let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);
+
+            let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast());
+            let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res);
+
+            _mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res);
+        }
+    }
+
+    add_vec_iter
+        .into_remainder()
+        .iter_mut()
+        .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar)))
+        .for_each(|(res, scaled)| {
+            *res ^= scaled;
+        });
+}
diff --git a/src/common/simd/x86/gfni/m512i.rs b/src/common/simd/x86/gfni/m512i.rs
new file mode 100644
index 0000000..6b9049f
--- /dev/null
+++ b/src/common/simd/x86/gfni/m512i.rs
@@ -0,0 +1,54 @@
+use crate::common::gf256::{GF256_HALF_ORDER, Gf256};
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[target_feature(enable = "gfni", enable = "avx512f")]
+pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
+    let mut iter = vec.chunks_exact_mut(4 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm512_set1_epi8(scalar as i8);
+        for chunk in iter.by_ref() {
+            let chunk_simd = _mm512_loadu_si512(chunk.as_ptr().cast());
+            let res = _mm512_gf2p8mul_epi8(chunk_simd, scalar_simd);
+
+            _mm512_storeu_si512(chunk.as_mut_ptr().cast(), res);
+        }
+    }
+
+    iter.into_remainder().iter_mut().for_each(|symbol| {
+        *symbol = Gf256::mul_const(*symbol, scalar);
+    });
+}
+
+#[target_feature(enable = "gfni", enable = "avx512f")]
+pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
+    let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * GF256_HALF_ORDER);
+    let mut mul_vec_iter = mul_vec.chunks_exact(4 * GF256_HALF_ORDER);
+
+    unsafe {
+        let scalar_simd = _mm512_set1_epi8(scalar as i8);
+
+        for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
+            let mul_vec_chunk_simd = _mm512_loadu_si512(mul_vec_chunk.as_ptr().cast());
+            let scaled_res = _mm512_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);
+
+            let add_vec_chunk_simd = _mm512_loadu_si512(add_vec_chunk.as_ptr().cast());
+            let accum_res = _mm512_xor_si512(add_vec_chunk_simd, scaled_res);
+
+            _mm512_storeu_si512(add_vec_chunk.as_mut_ptr().cast(), accum_res);
+        }
+    }
+
+    add_vec_iter
+        .into_remainder()
+        .iter_mut()
+        .zip(mul_vec_iter.remainder().iter().map(|&src_symbol| Gf256::mul_const(src_symbol, scalar)))
+        .for_each(|(res, scaled)| {
+            *res ^= scaled;
+        });
+}
diff --git a/src/common/simd/x86/gfni/mod.rs b/src/common/simd/x86/gfni/mod.rs
new file mode 100644
index 0000000..c988b66
--- /dev/null
+++ b/src/common/simd/x86/gfni/mod.rs
@@ -0,0 +1,3 @@
+pub mod m128i;
+pub mod m256i;
+pub mod m512i;
diff --git a/src/common/simd/x86/mod.rs b/src/common/simd/x86/mod.rs
index ba8a66d..e3b24a6 100644
--- a/src/common/simd/x86/mod.rs
+++ b/src/common/simd/x86/mod.rs
@@ -1,8 +1,24 @@
 mod avx2;
 mod avx512;
+mod gfni;
 mod ssse3;
 
 pub(super) fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) -> bool {
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512f") {
+        unsafe { gfni::m512i::mul_vec_by_scalar(vec, scalar) };
+        return true;
+    }
+
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") {
+        unsafe { gfni::m256i::mul_vec_by_scalar(vec, scalar) };
+        return true;
+    }
+
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") {
+        unsafe { gfni::m128i::mul_vec_by_scalar(vec, scalar) };
+        return true;
+    }
+
     if is_x86_feature_detected!("avx512bw") {
         unsafe { avx512::mul_vec_by_scalar(vec, scalar) };
         return true;
@@ -41,6 +57,21 @@ pub(super) fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) -> b
 }
 
 pub(super) fn gf256_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) -> bool {
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512f") {
+        unsafe { gfni::m512i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) };
+        return true;
+    }
+
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") {
+        unsafe { gfni::m256i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) };
+        return true;
+    }
+
+    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") {
+        unsafe { gfni::m128i::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) };
+        return true;
+    }
+
     if is_x86_feature_detected!("avx512bw") {
         unsafe { avx512::mul_vec_by_scalar_then_add_into(add_into_vec, mul_vec, scalar) };
         return true;
diff --git a/src/full/encoder.rs b/src/full/encoder.rs
index e1bc9ee..e5f92bd 100644
--- a/src/full/encoder.rs
+++ b/src/full/encoder.rs
@@ -243,10 +243,10 @@ impl Encoder {
             return Err(RLNCError::InvalidOutputBuffer);
         }
 
-        let (coding_vector, mut coded_data) = full_coded_piece.split_at_mut(self.piece_count);
+        let (coding_vector, coded_data) = full_coded_piece.split_at_mut(self.piece_count);
 
         rng.fill_bytes(coding_vector);
-        self.code_with_coding_vector(&coding_vector, &mut coded_data)
+        self.code_with_coding_vector(coding_vector, coded_data)
     }
 
     /// Produces a new coded piece, random sampling a coding vector.
diff --git a/src/full/recoder.rs b/src/full/recoder.rs
index 8702261..d00d544 100644
--- a/src/full/recoder.rs
+++ b/src/full/recoder.rs
@@ -124,7 +124,7 @@ impl Recoder {
             return Err(RLNCError::InvalidOutputBuffer);
         }
 
-        let (computed_coding_vector, mut recoded_data) = full_recoded_piece.split_at_mut(self.num_pieces_coded_together);
+        let (computed_coding_vector, recoded_data) = full_recoded_piece.split_at_mut(self.num_pieces_coded_together);
 
         // Compute the resulting coding vector for the original source pieces by multiplying
         // the random sampled recoding vector by the matrix of received coding vectors.
@@ -145,7 +145,7 @@ impl Recoder {
 
         unsafe {
             self.encoder
-                .code_with_coding_vector(&self.random_recoding_vector, &mut recoded_data)
+                .code_with_coding_vector(&self.random_recoding_vector, recoded_data)
                 .unwrap_unchecked()
         };
 
diff --git a/src/lib.rs b/src/lib.rs
index 643b4d7..06d653e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,9 +115,9 @@
 //!
 //! ```toml
 //! [dependencies]
-//! rlnc = "=0.8.4"                                      # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
+//! rlnc = "=0.8.5"                                      # On x86_64 and aarch64 targets, it offers fast encoding, recoding and decoding, using SIMD intrinsics.
 //! # or
-//! rlnc = { version = "=0.8.4", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding. Decoding is not yet parallelized.
+//! rlnc = { version = "=0.8.5", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding. Decoding is not yet parallelized.
 //!
 //! rand = { version = "=0.9.1" } # Required for random number generation
 //! ```