diff --git a/Project.toml b/Project.toml index f788bec7..7a0d3f84 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ReservoirComputing" uuid = "7c2d2b1e-3dd4-11ea-355a-8f6a8116e294" authors = ["Francesco Martinuzzi"] -version = "0.11.2" +version = "0.11.3" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/README.md b/README.md index 33e14083..a1182224 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,38 @@ Use the [in-development documentation](https://docs.sciml.ai/ReservoirComputing/dev/) to take a look at not yet released features. +## Citing + +If you use this library in your work, please cite: + +```bibtex +@article{martinuzzi2022reservoircomputing, + author = {Francesco Martinuzzi and Chris Rackauckas and Anas Abdelrehim and Miguel D. Mahecha and Karin Mora}, + title = {ReservoirComputing.jl: An Efficient and Modular Library for Reservoir Computing Models}, + journal = {Journal of Machine Learning Research}, + year = {2022}, + volume = {23}, + number = {288}, + pages = {1--8}, + url = {http://jmlr.org/papers/v23/22-0611.html} +} +``` + +## Installation + +ReservoirComputing.jl can be installed using either of + +```julia_repl +julia> ] #actually press the closing square brackets +pkg> add ReservoirComputing +``` +or + +```julia +using Pkg +Pkg.add("ReservoirComputing") +``` + ## Quick Example To illustrate the workflow of this library we will showcase @@ -36,7 +68,9 @@ For the `Generative` prediction we need the target data to be one step ahead of the training data: ```julia -using ReservoirComputing, OrdinaryDiffEq +using ReservoirComputing, OrdinaryDiffEq, Random +Random.seed!(42) +rng = MersenneTwister(17) #lorenz system parameters u0 = [1.0, 0.0, 0.0] @@ -74,7 +108,8 @@ res_size = 300 esn = ESN(input_data, input_size, res_size; reservoir=rand_sparse(; radius=1.2, sparsity=6 / res_size), input_layer=weighted_init, - nla_type=NLAT2()) + nla_type=NLAT2(), + rng=rng) ``` The echo state network can now be trained and tested. @@ -110,23 +145,6 @@ plot!(transpose(test)[:, 1], transpose(test)[:, 2], transpose(test)[:, 3]; label ![lorenz_attractor](https://user-images.githubusercontent.com/10376688/81470281-5a34b580-91ea-11ea-9eea-d2b266da19f4.png) -## Citing - -If you use this library in your work, please cite: - -```bibtex -@article{JMLR:v23:22-0611, - author = {Francesco Martinuzzi and Chris Rackauckas and Anas Abdelrehim and Miguel D. 
Mahecha and Karin Mora}, - title = {ReservoirComputing.jl: An Efficient and Modular Library for Reservoir Computing Models}, - journal = {Journal of Machine Learning Research}, - year = {2022}, - volume = {23}, - number = {288}, - pages = {1--8}, - url = {http://jmlr.org/papers/v23/22-0611.html} -} -``` - ## Acknowledgements This project was possible thanks to initial funding through diff --git a/docs/Project.toml b/docs/Project.toml index 1dd62c8c..a773fcd3 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,6 +2,7 @@ CellularAutomata = "878138dc-5b27-11ea-1a71-cb95d38d6b29" DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/docs/make.jl b/docs/make.jl index 7bbd16ca..3194943c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, ReservoirComputing +using Documenter, DocumenterCitations, ReservoirComputing cp("./docs/Manifest.toml", "./docs/src/assets/Manifest.toml"; force = true) cp("./docs/Project.toml", "./docs/src/assets/Project.toml"; force = true) @@ -8,9 +8,15 @@ ENV["GKSwstype"] = "100" include("pages.jl") mathengine = Documenter.MathJax() +bib = CitationBibliography( + joinpath(@__DIR__, "src", "refs.bib"); + style = :authoryear +) + makedocs(; modules = [ReservoirComputing], sitename = "ReservoirComputing.jl", clean = true, doctest = false, linkcheck = true, + plugins = [bib], format = Documenter.HTML(; mathengine, assets = ["assets/favicon.ico"], diff --git a/docs/pages.jl b/docs/pages.jl index 778f561f..773ba0db 100644 --- a/docs/pages.jl +++ b/docs/pages.jl @@ -20,5 +20,5 @@ pages = [ "ESN Initializers" => "api/inits.md", "ESN Drivers" => "api/esn_drivers.md", "ESN Variations" => "api/esn_variations.md", - "ReCA" => "api/reca.md"] + "ReCA" => "api/reca.md"] #"References" => "references.md" ] diff --git a/docs/src/api/esn_drivers.md b/docs/src/api/esn_drivers.md index 0bf0a388..b752741b 100644 --- a/docs/src/api/esn_drivers.md +++ b/docs/src/api/esn_drivers.md @@ -14,3 +14,10 @@ The `GRU` driver also provides the user with the choice of the possible variants ``` Please refer to the original papers for more detail about these architectures. + +## References + +```@bibliography +Pages = ["esn_drivers.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/api/inits.md b/docs/src/api/inits.md index ac4209c6..01680ee3 100644 --- a/docs/src/api/inits.md +++ b/docs/src/api/inits.md @@ -44,3 +44,10 @@ self_loop! add_jumps! 
``` + +## References + +```@bibliography +Pages = ["inits.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/api/states.md b/docs/src/api/states.md index 1f1fbe9c..caa54778 100644 --- a/docs/src/api/states.md +++ b/docs/src/api/states.md @@ -25,3 +25,10 @@ ```@docs ReservoirComputing.create_states ``` + +## References + +```@bibliography +Pages = ["states.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/esn_tutorials/change_layers.md b/docs/src/esn_tutorials/change_layers.md index 438f6b78..525d1499 100644 --- a/docs/src/esn_tutorials/change_layers.md +++ b/docs/src/esn_tutorials/change_layers.md @@ -26,7 +26,7 @@ Custom layers only need to follow these APIs to be compatible with ReservoirComp ## Example of minimally complex ESN -Using [^rodan2012] and [^rodan2010] as references this section will provide an +Using [Rodan2012](@cite) and [Rodan2011](@cite) as references this section will provide an example on how to change both the input layer and the reservoir for ESNs. The task for this example will be the one step ahead prediction of the Henon map. @@ -77,11 +77,9 @@ end As it is possible to see, changing layers in ESN models is straightforward. Be sure to check the API documentation for a full list of reservoir and layers. -## Bibliography +## References -[^rodan2012]: Rodan, Ali, and Peter Tiňo. - “Simple deterministically constructed cycle reservoirs with regular jumps.” - Neural computation 24.7 (2012): 1822-1852. -[^rodan2010]: Rodan, Ali, and Peter Tiňo. - “Minimum complexity echo state network.” - IEEE transactions on neural networks 22.1 (2010): 131-144. +```@bibliography +Pages = ["change_layers.md"] +Canonical = false +``` diff --git a/docs/src/esn_tutorials/deep_esn.md b/docs/src/esn_tutorials/deep_esn.md index 26ab0f00..9e259eb4 100644 --- a/docs/src/esn_tutorials/deep_esn.md +++ b/docs/src/esn_tutorials/deep_esn.md @@ -2,7 +2,7 @@ Deep Echo State Network architectures started to gain some traction recently. In this guide, we illustrate how it is possible to use ReservoirComputing.jl to build a deep ESN. -The network implemented in this library is taken from [^1]. It works by stacking reservoirs on top of each other, feeding the output from one into the next. The states are obtained by merging all the inner states of the stacked reservoirs. For a more in-depth explanation, refer to the paper linked above. +The network implemented in this library is taken from [Gallicchio2017](@cite). It works by stacking reservoirs on top of each other, feeding the output from one into the next. The states are obtained by merging all the inner states of the stacked reservoirs. For a more in-depth explanation, refer to the paper linked above. ## Lorenz Example @@ -88,6 +88,9 @@ plot(p1, p2, p3; plot_title="Lorenz System Coordinates", legendfontsize=12, titlefontsize=20) ``` -## Documentation +## References -[^1]: Gallicchio, Claudio, and Alessio Micheli. "_Deep echo state network (deepesn): A brief survey._" arXiv preprint arXiv:1712.04323 (2017). 
+```@bibliography +Pages = ["deep_esn.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/esn_tutorials/different_drivers.md b/docs/src/esn_tutorials/different_drivers.md index 9b72fdf0..e90afbbc 100644 --- a/docs/src/esn_tutorials/different_drivers.md +++ b/docs/src/esn_tutorials/different_drivers.md @@ -4,7 +4,7 @@ While the original implementation of the Echo State Network implemented the mode ## Multiple Activation Function RNN -Based on the double activation function ESN (DAFESN) proposed in [^1], the Multiple Activation Function ESN expands the idea and allows a custom number of activation functions to be used in the reservoir dynamics. This can be thought of as a linear combination of multiple activation functions with corresponding parameters. +Based on the double activation function ESN (DAFESN) proposed in [Lun2015](@cite), the Multiple Activation Function ESN expands the idea and allows a custom number of activation functions to be used in the reservoir dynamics. This can be thought of as a linear combination of multiple activation functions with corresponding parameters. ```math \mathbf{x}(t+1) = (1-\alpha)\mathbf{x}(t) + \lambda_1 f_1(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t)) + \dots + \lambda_D f_D(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t)) @@ -14,7 +14,7 @@ where ``D`` is the number of activation functions and respective parameters chos The method to call to use the multiple activation function ESN is `MRNN(activation_function, leaky_coefficient, scaling_factor)`. The arguments can be used as both `args` and `kwargs`. `activation_function` and `scaling_factor` have to be vectors (or tuples) containing the chosen activation functions and respective scaling factors (``f_1,...,f_D`` and ``\lambda_1,...,\lambda_D`` following the nomenclature introduced above). The `leaky_coefficient` represents ``\alpha`` and it is a single value. -Starting with the example, the data used is based on the following function based on the DAFESN paper [^1]. +Starting with the example, the data used is based on the following function based on the DAFESN paper [Lun2015](@cite). ```@example mrnn u(t) = sin(t) + sin(0.51 * t) + sin(0.22 * t) + sin(0.1002 * t) + sin(0.05343 * t) @@ -87,7 +87,7 @@ In this example, it is also possible to observe the input of parameters to the m ## Gated Recurrent Unit -Gated Recurrent Units (GRUs) [^2] have been proposed in more recent years with the intent of limiting notable problems of RNNs, like the vanishing gradient. This change in the underlying equations can be easily transported into the Reservoir Computing paradigm, by switching the RNN equations in the reservoir with the GRU equations. This approach has been explored in [^3] and [^4]. Different variations of GRU have been proposed [^5][^6]; this section is subdivided into different sections that go into detail about the governing equations and the implementation of them into ReservoirComputing.jl. Like before, to access the GRU reservoir driver, it suffices to change the `reservoir_diver` keyword argument for `ESN` with `GRU()`. All the variations that will be presented can be used in this package by leveraging the keyword argument `variant` in the method `GRU()` and specifying the chosen variant: `FullyGated()` or `Minimal()`. Other variations are possible by modifying the inner layers and reservoirs. The default is set to the standard version `FullyGated()`. 
The first section will go into more detail about the default of the `GRU()` method, and the following ones will refer to it to minimize repetitions. This example was run on Julia v1.7.2. +Gated Recurrent Units (GRUs) [Cho2014](@cite) have been proposed in more recent years with the intent of limiting notable problems of RNNs, like the vanishing gradient. This change in the underlying equations can be easily transported into the Reservoir Computing paradigm, by switching the RNN equations in the reservoir with the GRU equations. This approach has been explored in [Wang2020](@cite) and [Sarli2020](@cite). Different variations of GRU have been proposed [Dey2017](@cite); this section is subdivided into different sections that go into detail about the governing equations and the implementation of them into ReservoirComputing.jl. Like before, to access the GRU reservoir driver, it suffices to change the `reservoir_diver` keyword argument for `ESN` with `GRU()`. All the variations that will be presented can be used in this package by leveraging the keyword argument `variant` in the method `GRU()` and specifying the chosen variant: `FullyGated()` or `Minimal()`. Other variations are possible by modifying the inner layers and reservoirs. The default is set to the standard version `FullyGated()`. The first section will go into more detail about the default of the `GRU()` method, and the following ones will refer to it to minimize repetitions. ### Standard GRU @@ -104,7 +104,7 @@ Going over the `GRU` keyword argument, it will be explained how to feed the desi - `activation_function` is a vector with default values `[NNlib.sigmoid, NNlib.sigmoid, tanh]`. This argument controls the activation functions of the GRU, going from top to bottom. Changing the first element corresponds to changing the activation function for ``\mathbf{r}(t)`` and so on. - `inner_layer` is a vector with default values `fill(DenseLayer(), 2)`. This keyword argument controls the ``\mathbf{W}_{\text{in}}``s going from top to bottom like before. - - `reservoir` is a vector with default value `fill(RandSparseReservoir(), 2)`. In a similar fashion to `inner_layer`, this keyword argument controls the reservoir matrix construction in a top to bottom order. + - `reservoir` is a vector with default value `fill(RandSparseReservoir(), 2)`. Similarly to `inner_layer`, this keyword argument controls the reservoir matrix construction in a top to bottom order. - `bias` is again a vector with default value `fill(DenseLayer(), 2)`. It is meant to control the ``\mathbf{b}``s, going as usual from top to bottom. - `variant` controls the GRU variant. The default value is set to `FullyGated()`. @@ -161,7 +161,7 @@ This variation can be obtained by setting `variation=Minimal()`. The `inner_laye To showcase the use of the `GRU()` method, this section will only illustrate the standard `FullyGated()` version. The full script for this example with the data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/tree/main/change_drivers/gru). -The data used for this example is the Santa Fe laser dataset [^7] retrieved from [here](https://web.archive.org/web/20160427182805/http://www-psych.stanford.edu/%7Eandreas/Time-Series/SantaFe.html). The data is split to account for a next step prediction. +The data used for this example is the Santa Fe laser dataset [Hbner1989](@cite) retrieved from [here](https://web.archive.org/web/20160427182805/http://www-psych.stanford.edu/%7Eandreas/Time-Series/SantaFe.html). 
The data is split to account for a next step prediction. ```@example gru using DelimitedFiles @@ -241,10 +241,9 @@ println(msd(testing_target, output)) println(msd(testing_target, output_rnn)) ``` -[^1]: Lun, Shu-Xian, et al. "_A novel model of leaky integrator echo state network for time-series prediction._" Neurocomputing 159 (2015): 58-66. -[^2]: Cho, Kyunghyun, et al. “_Learning phrase representations using RNN encoder-decoder for statistical machine translation._” arXiv preprint arXiv:1406.1078 (2014). -[^3]: Wang, Xinjie, Yaochu Jin, and Kuangrong Hao. "_A Gated Recurrent Unit based Echo State Network._" 2020 International Joint Conference on Neural Networks (IJCNN). IEEE, 2020. -[^4]: Di Sarli, Daniele, Claudio Gallicchio, and Alessio Micheli. "_Gated Echo State Networks: a preliminary study._" 2020 International Conference on INnovations in Intelligent SysTems and Applications (INISTA). IEEE, 2020. -[^5]: Dey, Rahul, and Fathi M. Salem. "_Gate-variants of gated recurrent unit (GRU) neural networks._" 2017 IEEE 60th international midwest symposium on circuits and systems (MWSCAS). IEEE, 2017. -[^6]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._" International Journal of Automation and Computing 13.3 (2016): 226-234. -[^7]: Hübner, Uwe, Nimmi B. Abraham, and Carlos O. Weiss. "_Dimensions and entropies of chaotic intensity pulsations in a single-mode far-infrared NH 3 laser._" Physical Review A 40.11 (1989): 6354. +## References + +```@bibliography +Pages = ["different_drivers.md"] +Canonical = false +``` diff --git a/docs/src/esn_tutorials/hybrid.md b/docs/src/esn_tutorials/hybrid.md index cd68fe6c..cbe7ac4d 100644 --- a/docs/src/esn_tutorials/hybrid.md +++ b/docs/src/esn_tutorials/hybrid.md @@ -1,6 +1,6 @@ # Hybrid Echo State Networks -Following the idea of giving physical information to machine learning models, the hybrid echo state networks [^1] try to achieve this results by feeding model data into the ESN. In this example, it is explained how to create and leverage such models in ReservoirComputing.jl. +Following the idea of giving physical information to machine learning models, the hybrid echo state networks [Pathak2018](@cite) try to achieve this results by feeding model data into the ESN. In this example, it is explained how to create and leverage such models in ReservoirComputing.jl. ## Generating the data @@ -94,6 +94,9 @@ plot(p1, p2, p3; plot_title="Lorenz System Coordinates", legendfontsize=12, titlefontsize=20) ``` -## Bibliography +## References -[^1]: Pathak, Jaideep, et al. "_Hybrid forecasting of chaotic processes: Using machine learning in conjunction with a knowledge-based model._" Chaos: An Interdisciplinary Journal of Nonlinear Science 28.4 (2018): 041101. +```@bibliography +Pages = ["hybrid.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/esn_tutorials/lorenz_basic.md b/docs/src/esn_tutorials/lorenz_basic.md index 081d0ce9..9e67d1aa 100644 --- a/docs/src/esn_tutorials/lorenz_basic.md +++ b/docs/src/esn_tutorials/lorenz_basic.md @@ -40,7 +40,7 @@ It is *important* to notice that the data needs to be formatted in a matrix with ## Building the Echo State Network -Once the data is ready, it is possible to define the parameters for the ESN and the `ESN` struct itself. In this example, the values from [^1] are loosely followed as general guidelines. +Once the data is ready, it is possible to define the parameters for the ESN and the `ESN` struct itself. 
In this example, the values from [Pathak2017](@cite) are loosely followed as general guidelines. ```@example lorenz using ReservoirComputing @@ -63,9 +63,9 @@ esn = ESN(input_data, in_size, res_size; Most of the parameters chosen here mirror the default ones, so a direct call is not necessary. The readme example is identical to this one, except for the explicit call. Going line by line to see what is happening, starting from `res_size`: this value determines the dimensions of the reservoir matrix. In this case, a size of 300 has been chosen, so the reservoir matrix will be 300 x 300. This is not always the case, since some input layer constructions can modify the dimensions of the reservoir, but in that case, everything is taken care of internally. -The `res_radius` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to assure the Echo State Property. The default value in the `rand_sparse` method is 1.0 in accordance with the most commonly followed guidelines found in the literature (see [^2] and references therein). The `sparsity` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course, it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. +The `res_radius` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to assure the Echo State Property. The default value in the `rand_sparse` method is 1.0 in accordance with the most commonly followed guidelines found in the literature (see [Lukoeviius2012](@cite) and references therein). The `sparsity` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course, it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. -The value of `input_scaling` determines the upper and lower bounds of the uniform distribution of the weights in the `weighted_init`. The value of 0.1 represents the default. The default input layer is the `scaled_rand`, a dense matrix. The details of the weighted version can be found in [^3], for this example, this version returns the best results. +The value of `input_scaling` determines the upper and lower bounds of the uniform distribution of the weights in the `weighted_init`. The value of 0.1 represents the default. The default input layer is the `scaled_rand`, a dense matrix. The details of the weighted version can be found in [Lu2017](@cite), for this example, this version returns the best results. The reservoir driver represents the dynamics of the reservoir. In the standard ESN definition, these dynamics are obtained through a Recurrent Neural Network (RNN), and this is reflected by calling the `RNN` driver for the `ESN` struct. This option is set as the default, and unless there is the need to change parameters, it is not needed. The full equation is the following: @@ -126,8 +126,9 @@ plot(p1, p2, p3; plot_title="Lorenz System Coordinates", legendfontsize=12, titlefontsize=20) ``` -## Bibliography +## References -[^1]: Pathak, Jaideep, et al. "_Using machine learning to replicate chaotic attractors and calculate Lyapunov exponents from data._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.12 (2017): 121102. -[^2]: Lukoševičius, Mantas. "_A practical guide to applying echo state networks._" Neural networks: Tricks of the trade. 
Springer, Berlin, Heidelberg, 2012. 659-686. -[^3]: Lu, Zhixin, et al. "_Reservoir observers: Model-free inference of unmeasured variables in chaotic systems._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102. +```@bibliography +Pages = ["lorenz_basic.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/general/states_variation.md b/docs/src/general/states_variation.md index e5767a24..bd6e66a2 100644 --- a/docs/src/general/states_variation.md +++ b/docs/src/general/states_variation.md @@ -18,7 +18,7 @@ You can choose not to apply any of these changes to the states by calling `Stand ## Non-Linear Algorithms -First introduced in [^1] and expanded in [^2], non-linear algorithms are nonlinear combinations of the columns of the matrix states. There are three such algorithms implemented in ReservoirComputing.jl, and you can choose which one to use with the `nla_type` keyword argument. The default value is set to `NLADefault()`, which means no non-linear algorithm is applied. +First introduced in [Pathak2017](@cite) and expanded in [Chattopadhyay2020](@cite), non-linear algorithms are nonlinear combinations of the columns of the matrix states. There are three such algorithms implemented in ReservoirComputing.jl, and you can choose which one to use with the `nla_type` keyword argument. The default value is set to `NLADefault()`, which means no non-linear algorithm is applied. The available non-linear algorithms are: @@ -31,23 +31,27 @@ These algorithms perform specific operations on the reservoir states. To provide **NLAT1** ```math -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \times \textbf{x}_{i,j} \ \ \text{if \textit{j} is odd} \\ -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is even} +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \times \textbf{x}_{i,j} \ \ \text{if j is odd} \\ +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if j is even} ``` **NLAT2** ```math -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j-2} \ \ \text{if \textit{j} > 1 is odd} \\ -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is 1 or even} +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j-2} \ \ \text{if j > 1 is odd} \\ +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if j is 1 or even} ``` **NLAT3** ```math -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j+1} \ \ \text{if \textit{j} > 1 is odd} \\ -\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is 1 or even} +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j+1} \ \ \text{if j > 1 is odd} \\ +\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if j is 1 or even} ``` -[^1]: Pathak, Jaideep, et al. "_Using machine learning to replicate chaotic attractors and calculate Lyapunov exponents from data._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.12 (2017): 121102. -[^2]: Chattopadhyay, Ashesh, Pedram Hassanzadeh, and Devika Subramanian. "_Data-driven predictions of a multiscale Lorenz 96 chaotic system using machine-learning methods: reservoir computing, artificial neural network, and long short-term memory network._" Nonlinear Processes in Geophysics 27.3 (2020): 373-389. 
+## References + +```@bibliography +Pages = ["states_variation.md"] +Canonical = false +``` \ No newline at end of file diff --git a/docs/src/reca_tutorials/reca.md b/docs/src/reca_tutorials/reca.md index ac05b4fb..fe8cdf8d 100644 --- a/docs/src/reca_tutorials/reca.md +++ b/docs/src/reca_tutorials/reca.md @@ -1,8 +1,8 @@ # Reservoir Computing using Cellular Automata -Reservoir Computing based on Elementary Cellular Automata (ECA) has been recently introduced. Dubbed as ReCA [^1][^2] it proposed the advantage of storing the reservoir states as binary data. Less parameter tuning represents another advantage of this model. The architecture implemented in ReservoirComputing.jl follows [^3] which builds on top of the original implementation, improving the results. It is strongly suggested to go through the paper to get a solid understanding of the model before delving into experimentation with the code. +Reservoir Computing based on Elementary Cellular Automata (ECA) has been recently introduced. Dubbed as ReCA [Yilmaz2014](@cite) [Margem2017](@cite) it proposed the advantage of storing the reservoir states as binary data. Less parameter tuning represents another advantage of this model. The architecture implemented in ReservoirComputing.jl follows [Nichele2017](@cite) which builds on top of the original implementation, improving the results. It is strongly suggested to go through the paper to get a solid understanding of the model before delving into experimentation with the code. -To showcase how to use these models, this page illustrates the performance of ReCA in the 5 bit memory task [^4]. The script for the example and companion data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/tree/main/reca). +To showcase how to use these models, this page illustrates the performance of ReCA in the 5 bit memory task. ## 5 bit memory task @@ -46,7 +46,3 @@ final_pred = convert(AbstractArray{Float32}, prediction .> 0.5) final_pred == output ``` -[^1]: Yilmaz, Ozgur. "Reservoir computing using cellular automata." arXiv preprint arXiv:1410.0162 (2014). -[^2]: Margem, Mrwan, and Ozgür Yilmaz. "An experimental study on cellular automata reservoir in pathological sequence learning tasks." (2017). -[^3]: Nichele, Stefano, and Andreas Molund. "Deep reservoir computing using cellular automata." arXiv preprint arXiv:1703.02806 (2017). -[^4]: Hochreiter, Sepp, and Jürgen Schmidhuber. "Long short-term memory." Neural computation 9.8 (1997): 1735-1780. 
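The `NLAT*` transforms described in `docs/src/general/states_variation.md` above are callable objects that act directly on a matrix of reservoir states, mirroring the doctests in `src/states.jl` further down in this patch. A minimal usage sketch (the matrix below is purely illustrative, and the row convention follows the `NLAT2` docstring rather than a verified run):

```julia
using ReservoirComputing

# toy "states" matrix: 5 reservoir nodes (rows) over 3 time steps (columns)
states = reshape(collect(1.0:15.0), 5, 3)

nlat = NLAT2()            # NLAT1() and NLAT3() follow the same pattern
states_nl = nlat(states)  # per the NLAT2 docstring: odd-indexed rows after the
                          # first are replaced by the product of the two
                          # preceding rows; the remaining rows pass through
```

These are the same objects selected through the `nla_type=NLAT2()` keyword in the README quick example.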
diff --git a/docs/src/references.md b/docs/src/references.md new file mode 100644 index 00000000..4b476771 --- /dev/null +++ b/docs/src/references.md @@ -0,0 +1,4 @@ +# References + +```@bibliography +``` \ No newline at end of file diff --git a/docs/src/refs.bib b/docs/src/refs.bib new file mode 100644 index 00000000..7a0ebc98 --- /dev/null +++ b/docs/src/refs.bib @@ -0,0 +1,329 @@ +@article{Lu2017, + title = {Reservoir observers: Model-free inference of unmeasured variables in chaotic systems}, + volume = {27}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/1.4979665}, + DOI = {10.1063/1.4979665}, + number = {4}, + journal = {Chaos: An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Lu, Zhixin and Pathak, Jaideep and Hunt, Brian and Girvan, Michelle and Brockett, Roger and Ott, Edward}, + year = {2017}, + month = apr +} + +@article{Pathak2018, + title = {Hybrid forecasting of chaotic processes: Using machine learning in conjunction with a knowledge-based model}, + volume = {28}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/1.5028373}, + DOI = {10.1063/1.5028373}, + number = {4}, + journal = {Chaos: An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Pathak, Jaideep and Wikner, Alexander and Fussell, Rebeckah and Chandra, Sarthak and Hunt, Brian R. and Girvan, Michelle and Ott, Edward}, + year = {2018}, + month = apr +} + +@article{Rodan2011, + title = {Minimum Complexity Echo State Network}, + volume = {22}, + ISSN = {1941-0093}, + url = {http://dx.doi.org/10.1109/TNN.2010.2089641}, + DOI = {10.1109/tnn.2010.2089641}, + number = {1}, + journal = {IEEE Transactions on Neural Networks}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + author = {Rodan, A and Tino, P}, + year = {2011}, + month = jan, + pages = {131–144} +} + +@article{Xie2024, + title = {Time Series Prediction of ESN Based on Chebyshev Mapping and Strongly Connected Topology}, + volume = {56}, + ISSN = {1573-773X}, + url = {http://dx.doi.org/10.1007/s11063-024-11474-7}, + DOI = {10.1007/s11063-024-11474-7}, + number = {1}, + journal = {Neural Processing Letters}, + publisher = {Springer Science and Business Media LLC}, + author = {Xie, Minzhi and Wang, Qianxue and Yu, Simin}, + year = {2024}, + month = feb +} + +@article{Wang2022, + title = {Echo state network with logistic mapping and bias dropout for time series prediction}, + volume = {489}, + ISSN = {0925-2312}, + url = {http://dx.doi.org/10.1016/j.neucom.2022.03.018}, + DOI = {10.1016/j.neucom.2022.03.018}, + journal = {Neurocomputing}, + publisher = {Elsevier BV}, + author = {Wang, Heshan and Liu, Yuxi and Lu, Peng and Luo, Yong and Wang, Dongshu and Xu, Xiangyang}, + year = {2022}, + month = jun, + pages = {196–210} +} + +@article{Xie2024, + title = {Time Series Prediction of ESN Based on Chebyshev Mapping and Strongly Connected Topology}, + volume = {56}, + ISSN = {1573-773X}, + url = {http://dx.doi.org/10.1007/s11063-024-11474-7}, + DOI = {10.1007/s11063-024-11474-7}, + number = {1}, + journal = {Neural Processing Letters}, + publisher = {Springer Science and Business Media LLC}, + author = {Xie, Minzhi and Wang, Qianxue and Yu, Simin}, + year = {2024}, + month = feb +} + +@article{Griffith2019, + title = {Forecasting chaotic systems with very low connectivity reservoir computers}, + volume = {29}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/1.5120710}, + DOI = {10.1063/1.5120710}, + number = {12}, + journal = {Chaos: 
An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Griffith, Aaron and Pomerance, Andrew and Gauthier, Daniel J.}, + year = {2019}, + month = dec +} + +@article{Fu2023, + title = {A double-cycle echo state network topology for time series prediction}, + volume = {33}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/5.0159966}, + DOI = {10.1063/5.0159966}, + number = {9}, + journal = {Chaos: An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Fu, Jun and Li, Guangli and Tang, Jianfeng and Xia, Lei and Wang, Lidan and Duan, Shukai}, + year = {2023}, + month = sep +} + +@article{Elsarraj2019, + title={Demystifying echo state network with deterministic simple topologies}, + author={Elsarraj, Duaa and Qisi, Maha Al and Rodan, Ali and Obeid, Nadim and Sharieh, Ahmad and Faris, Hossam}, + journal={International Journal of Computational Science and Engineering}, + volume={19}, + number={3}, + pages={407--417}, + year={2019}, + publisher={Inderscience Publishers (IEL)} +} + +@article{Viehweg2025, + title={Deterministic Reservoir Computing for Chaotic Time Series Prediction}, + author={Viehweg, Johannes and Poll, Constanze and M{\"a}der, Patrick}, + journal={arXiv preprint arXiv:2501.15615}, + year={2025} +} + +@article{Yang2018, + title = {Design of polynomial echo state networks for time series prediction}, + volume = {290}, + ISSN = {0925-2312}, + url = {http://dx.doi.org/10.1016/j.neucom.2018.02.036}, + DOI = {10.1016/j.neucom.2018.02.036}, + journal = {Neurocomputing}, + publisher = {Elsevier BV}, + author = {Yang, Cuili and Qiao, Junfei and Han, Honggui and Wang, Lei}, + year = {2018}, + month = may, + pages = {148–160} +} + +@article{Rodan2012, + title = {Simple Deterministically Constructed Cycle Reservoirs with Regular Jumps}, + volume = {24}, + ISSN = {1530-888X}, + url = {http://dx.doi.org/10.1162/NECO_a_00297}, + DOI = {10.1162/neco_a_00297}, + number = {7}, + journal = {Neural Computation}, + publisher = {MIT Press - Journals}, + author = {Rodan, Ali and Tiňo, Peter}, + year = {2012}, + month = jul, + pages = {1822–1852} +} + +@article{Pathak2017, + title = {Using machine learning to replicate chaotic attractors and calculate Lyapunov exponents from data}, + volume = {27}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/1.5010300}, + DOI = {10.1063/1.5010300}, + number = {12}, + journal = {Chaos: An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Pathak, Jaideep and Lu, Zhixin and Hunt, Brian R. 
and Girvan, Michelle and Ott, Edward}, + year = {2017}, + month = dec +} + +@article{Chattopadhyay2020, + title = {Data-driven predictions of a multiscale Lorenz 96 chaotic system using machine-learning methods: reservoir computing, artificial neural network, and long short-term memory network}, + volume = {27}, + ISSN = {1607-7946}, + url = {http://dx.doi.org/10.5194/npg-27-373-2020}, + DOI = {10.5194/npg-27-373-2020}, + number = {3}, + journal = {Nonlinear Processes in Geophysics}, + publisher = {Copernicus GmbH}, + author = {Chattopadhyay, Ashesh and Hassanzadeh, Pedram and Subramanian, Devika}, + year = {2020}, + month = jul, + pages = {373–389} +} + +@inbook{Lukoeviius2012, + title = {A Practical Guide to Applying Echo State Networks}, + ISBN = {9783642352898}, + ISSN = {1611-3349}, + url = {http://dx.doi.org/10.1007/978-3-642-35289-8_36}, + DOI = {10.1007/978-3-642-35289-8_36}, + booktitle = {Neural Networks: Tricks of the Trade}, + publisher = {Springer Berlin Heidelberg}, + author = {Lukoševičius, Mantas}, + year = {2012}, + pages = {659–686} +} + +@article{Lun2015, + title = {A novel model of leaky integrator echo state network for time-series prediction}, + volume = {159}, + ISSN = {0925-2312}, + url = {http://dx.doi.org/10.1016/j.neucom.2015.02.029}, + DOI = {10.1016/j.neucom.2015.02.029}, + journal = {Neurocomputing}, + publisher = {Elsevier BV}, + author = {Lun, Shu-Xian and Yao, Xian-Shuang and Qi, Hong-Yun and Hu, Hai-Feng}, + year = {2015}, + month = jul, + pages = {58–66} +} + +@article{Cho2014, + title={Learning phrase representations using RNN encoder-decoder for statistical machine translation}, + author={Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua}, + journal={arXiv preprint arXiv:1406.1078}, + year={2014} +} + +@inproceedings{Wang2020, + title = {A Gated Recurrent Unit based Echo State Network}, + url = {http://dx.doi.org/10.1109/IJCNN48605.2020.9206786}, + DOI = {10.1109/ijcnn48605.2020.9206786}, + booktitle = {2020 International Joint Conference on Neural Networks (IJCNN)}, + publisher = {IEEE}, + author = {Wang, Xinjie and Jin, Yaochu and Hao, Kuangrong}, + year = {2020}, + month = jul, + pages = {1–7} +} + +@inproceedings{Sarli2020, + title = {Gated Echo State Networks: a preliminary study}, + url = {http://dx.doi.org/10.1109/INISTA49547.2020.9194681}, + DOI = {10.1109/inista49547.2020.9194681}, + booktitle = {2020 International Conference on INnovations in Intelligent SysTems and Applications (INISTA)}, + publisher = {IEEE}, + author = {Sarli, Daniele Di and Gallicchio, Claudio and Micheli, Alessio}, + year = {2020}, + month = aug, + pages = {1–5} +} + +@inproceedings{Dey2017, + title = {Gate-variants of Gated Recurrent Unit (GRU) neural networks}, + url = {http://dx.doi.org/10.1109/MWSCAS.2017.8053243}, + DOI = {10.1109/mwscas.2017.8053243}, + booktitle = {2017 IEEE 60th International Midwest Symposium on Circuits and Systems (MWSCAS)}, + publisher = {IEEE}, + author = {Dey, Rahul and Salem, Fathi M.}, + year = {2017}, + month = aug, + pages = {1597–1600} +} + +@article{Hbner1989, + title = {Dimensions and entropies of chaotic intensity pulsations in a single-mode far-infraredNH3laser}, + volume = {40}, + ISSN = {0556-2791}, + url = {http://dx.doi.org/10.1103/PhysRevA.40.6354}, + DOI = {10.1103/physreva.40.6354}, + number = {11}, + journal = {Physical Review A}, + publisher = {American Physical Society (APS)}, + author = {H\"{u}bner, U. and Abraham, N. B. 
and Weiss, C. O.}, + year = {1989}, + month = dec, + pages = {6354–6365} +} + +@article{Gallicchio2017, + title={Deep echo state network (deepesn): A brief survey}, + author={Gallicchio, Claudio and Micheli, Alessio}, + journal={arXiv preprint arXiv:1712.04323}, + year={2017} +} + +@article{Yilmaz2014, + title={Reservoir computing using cellular automata}, + author={Yilmaz, Ozgur}, + journal={arXiv preprint arXiv:1410.0162}, + year={2014} +} + +@misc{Margem2017, + title={An experimental study on cellular automata reservoir in pathological sequence learning tasks}, + author={Margem, Mrwan and Yilmaz, Ozg{\"u}r}, + year={2017}, + publisher={Jul} +} + +@article{Nichele2017, + title={Deep reservoir computing using cellular automata}, + author={Nichele, Stefano and Molund, Andreas}, + journal={arXiv preprint arXiv:1703.02806}, + year={2017} +} + +@article{Barbosa2021, + title = {Symmetry-aware reservoir computing}, + volume = {104}, + ISSN = {2470-0053}, + url = {http://dx.doi.org/10.1103/PhysRevE.104.045307}, + DOI = {10.1103/physreve.104.045307}, + number = {4}, + journal = {Physical Review E}, + publisher = {American Physical Society (APS)}, + author = {Barbosa, Wendson A. S. and Griffith, Aaron and Rowlands, Graham E. and Govia, Luke C. G. and Ribeill, Guilhem J. and Nguyen, Minh-Hai and Ohki, Thomas A. and Gauthier, Daniel J.}, + year = {2021}, + month = oct +} + +@article{Herteux2020, + title = {Breaking symmetries of the reservoir equations in echo state networks}, + volume = {30}, + ISSN = {1089-7682}, + url = {http://dx.doi.org/10.1063/5.0028993}, + DOI = {10.1063/5.0028993}, + number = {12}, + journal = {Chaos: An Interdisciplinary Journal of Nonlinear Science}, + publisher = {AIP Publishing}, + author = {Herteux, Joschka and R\"{a}th, Christoph}, + year = {2020}, + month = dec +} \ No newline at end of file diff --git a/src/esn/esn_inits.jl b/src/esn/esn_inits.jl index de1f2713..d0622201 100644 --- a/src/esn/esn_inits.jl +++ b/src/esn/esn_inits.jl @@ -49,7 +49,8 @@ end Create and return a matrix representing a weighted input layer. This initializer generates a weighted input matrix with random non-zero -elements distributed uniformly within the range [-`scaling`, `scaling`] [^lu2017]. +elements distributed uniformly within the range +[-`scaling`, `scaling`] [Lu2017](@cite). # Arguments @@ -78,11 +79,6 @@ julia> res_input = weighted_init(8, 3) 0.0 0.0 0.0577838 0.0 0.0 -0.0562827 ``` - -[^lu2017]: Lu, Zhixin, et al. - "Reservoir observers: Model-free inference of unmeasured variables in - chaotic systems." - Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102. """ function weighted_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; scaling::Number = T(0.1), return_sparse::Bool = false) where {T <: Number} @@ -109,7 +105,7 @@ end Create and return a minimal weighted input layer matrix. This initializer generates a weighted input matrix with equal, deterministic elements in the same construction as [`weighted_minimal]`(@ref), -inspired by [^lu2017]. +inspired by [Lu2017](@cite). Please note that this initializer computes its own reservoir size! If the computed reservoir size is different than the provided one it will raise a @@ -188,11 +184,6 @@ julia> res_input = weighted_minimal(9, 3; sampling_type = :bernoulli_sample!) -0.0 -0.0 0.1 0.0 -0.0 0.1 ``` - -[^lu2017]: Lu, Zhixin, et al. - "Reservoir observers: Model-free inference of unmeasured variables in - chaotic systems." - Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102. 
""" function weighted_minimal(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Number = T(0.1), return_sparse::Bool = false, @@ -216,7 +207,8 @@ end informed_init([rng], [T], dims...; scaling=0.1, model_in_size, gamma=0.5) -Create an input layer for informed echo state networks [^pathak2018]. +Create an input layer for informed echo state +networks [Pathak2018](@cite). # Arguments @@ -234,10 +226,6 @@ Create an input layer for informed echo state networks [^pathak2018]. - `gamma`: The gamma value. Default is 0.5. # Examples - -[^pathak2018]: Pathak, Jaideep, et al. "Hybrid forecasting of chaotic processes: - Using machine learning in conjunction with a knowledge-based model." - Chaos: An Interdisciplinary Journal of Nonlinear Science 28.4 (2018). """ function informed_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; scaling::Number = T(0.1), model_in_size::Integer, @@ -281,8 +269,9 @@ end sampling_type=:bernoulli_sample!, weight=0.1, irrational=pi, start=1, p=0.5) -Create a layer matrix with uniform weights determined by `weight` [^rodan2010]. -The sign difference is randomly determined by the `sampling` chosen. +Create a layer matrix with uniform weights determined by +`weight` [Rodan2011](@cite). The sign difference is randomly +determined by the `sampling` chosen. # Arguments @@ -358,10 +347,6 @@ julia> res_input = minimal_init(8, 3; p = 0.8)# higher p -> more positive signs -0.1 0.1 0.1 0.1 0.1 0.1 ``` - -[^rodan2010]: Rodan, Ali, and Peter Tino. - "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. """ function minimal_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Number = T(0.1), sampling_type::Symbol = :bernoulli_sample!, @@ -379,7 +364,8 @@ end amplitude=one(T), sine_divisor=one(T), chebyshev_parameter=one(T), return_sparse=false) -Generate a Chebyshev-mapped matrix [^xie2024]. The first row is initialized +Generate a Chebyshev-mapped matrix [Xie2024](@cite). +The first row is initialized using a sine function and subsequent rows are iteratively generated via the Chebyshev mapping. The first row is defined as: @@ -431,11 +417,6 @@ julia> input_matrix = chebyshev_mapping(10, 3) 0.866025 0.866025 -4.37114f-8 0.866025 0.866025 -4.37114f-8 ``` - -[^xie2024]: Xie, Minzhi, Qianxue Wang, and Simin Yu. - "Time Series Prediction of ESN Based on Chebyshev Mapping and Strongly - Connected Topology." - Neural Processing Letters 56.1 (2024): 30. """ function chebyshev_mapping(rng::AbstractRNG, ::Type{T}, dims::Integer...; amplitude::AbstractFloat = one(T), sine_divisor::AbstractFloat = one(T), @@ -463,8 +444,8 @@ end amplitude=0.3, sine_divisor=5.9, logistic_parameter=3.7, return_sparse=false) -Generate an input weight matrix using a logistic mapping [^wang2022].The first -row is initialized using a sine function: +Generate an input weight matrix using a logistic mapping [Wang2022](@cite) +The first row is initialized using a sine function: ```math W[1, j] = \text{amplitude} \cdot \sin(j \cdot \pi / @@ -511,11 +492,6 @@ julia> logistic_mapping(8, 3) 0.841322 0.767132 0.791346 ``` - - -[^wang2022]: Wang, Heshan, et al. "Echo state network with logistic - mapping and bias dropout for time series prediction." - Neurocomputing 489 (2022): 196-210. 
""" function logistic_mapping(rng::AbstractRNG, ::Type{T}, dims::Integer...; amplitude::AbstractFloat = 0.3, sine_divisor::AbstractFloat = 5.9, @@ -544,8 +520,8 @@ end factor, amplitude=0.3, sine_divisor=5.9, logistic_parameter=2.35, return_sparse=false) -Generate a input weight matrix based on the logistic mapping [^viehweg2025]. The -matrix is built so that each input is transformed into a high-dimensional feature +Generate a input weight matrix based on the logistic mapping [Viehweg2025](@cite). +Thematrix is built so that each input is transformed into a high-dimensional feature space via a recursive logistic map. For each input, a chain of weights is generated as follows: - The first element of the chain is initialized using a sine function: @@ -614,10 +590,6 @@ julia> modified_lm(12, 4; factor=3) ⋅ ⋅ ⋅ 0.192168 ``` - -[^viehweg2025]: Viehweg, Johannes, Constanze Poll, and Patrick Mäder. - "Deterministic Reservoir Computing for Chaotic Time Series Prediction." - arXiv preprint arXiv:2501.15615 (2025). """ function modified_lm(rng::AbstractRNG, ::Type{T}, dims::Integer...; factor::Integer, amplitude::AbstractFloat = 0.3, @@ -704,7 +676,7 @@ end return_sparse=false) Returns an initializer to build a sparse reservoir matrix with the given -`sparsity` by using a pseudo-SVD approach as described in [^yang2018]. +`sparsity` by using a pseudo-SVD approach as described in [Yang2018](@cite). # Arguments @@ -741,10 +713,6 @@ julia> res_matrix = pseudo_svd(5, 5) 0.0 0.0 0.0 0.726199 0.0 0.0 0.0 0.0 0.0 1.0 ``` - -[^yang2018]: Yang, Cuili, et al. - "_Design of polynomial echo state networks for time series prediction._" - Neurocomputing 290 (2018): 148-160. """ function pseudo_svd(rng::AbstractRNG, ::Type{T}, dims::Integer...; max_value::Number = T(1.0), sparsity::Number = 0.1, sorted::Bool = true, @@ -825,7 +793,7 @@ end extra_edge_probability=T(0.1), spectral_radius=one(T), return_sparse=false) -Construct a chaotic reservoir matrix using a digital chaotic system [^xie2024]. +Construct a chaotic reservoir matrix using a digital chaotic system [Xie2024](@cite). The matrix topology is derived from a strongly connected adjacency matrix based on a digital chaotic system operating at finite precision. @@ -866,11 +834,6 @@ julia> res_matrix = chaotic_init(8, 8) ⋅ -2.60383 ⋅ -2.90391 -0.578156 ⋅ ⋅ ⋅ ``` - -[^xie2024]: Xie, Minzhi, Qianxue Wang, and Simin Yu. - "Time Series Prediction of ESN Based on Chebyshev Mapping and Strongly - Connected Topology." - Neural Processing Letters 56.1 (2024): 30. """ function chaotic_init(rng::AbstractRNG, ::Type{T}, dims::Integer...; extra_edge_probability::AbstractFloat = T(0.1), spectral_radius::AbstractFloat = one(T), @@ -937,7 +900,7 @@ end Construct an internal reservoir connectivity matrix with low connectivity. This function creates a square reservoir matrix with the specified in-degree -for each node [^griffith2019]. When `in_degree` is 1, the function can enforce +for each node [Griffith2019](@cite). When `in_degree` is 1, the function can enforce a fully connected cycle if `connected` is `true`; otherwise, it generates a random connectivity pattern. @@ -961,10 +924,6 @@ otherwise, it generates a random connectivity pattern. Defaults to 1.0. - `cut_cycle`: If `true`, removes one edge from the cycle to cut it. Default is `false`. - -[^griffith2019]: Griffith, Aaron, Andrew Pomerance, and Daniel J. Gauthier. - "Forecasting chaotic systems with very low connectivity reservoir computers." - Chaos: An Interdisciplinary Journal of Nonlinear Science 29.12 (2019). 
""" function low_connectivity(rng::AbstractRNG, ::Type{T}, dims::Integer...; return_sparse::Bool = false, connected::Bool = false, @@ -1034,7 +993,7 @@ end weight=0.1, return_sparse=false, kwargs...) -Create and return a delay line reservoir matrix [^rodan2010]. +Create and return a delay line reservoir matrix [Rodan2011](@cite). # Arguments @@ -1089,10 +1048,6 @@ julia> res_matrix = delay_line(5, 5; weight = 1) 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ``` - -[^rodan2010]: Rodan, Ali, and Peter Tino. - "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. """ function delay_line(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Union{Number, AbstractVector} = T(0.1), shift::Integer = 1, @@ -1106,11 +1061,11 @@ end """ delay_line_backward([rng], [T], dims...; - weight=0.1, fb_weight=0.2, return_sparse=false, + weight=0.1, fb_weight=0.1, return_sparse=false, delay_kwargs=(), fb_kwargs=()) Create a delay line backward reservoir with the specified by `dims` and weights. -Creates a matrix with backward connections as described in [^rodan2010]. +Creates a matrix with backward connections as described in [Rodan2011](@cite). # Arguments @@ -1134,7 +1089,7 @@ Creates a matrix with backward connections as described in [^rodan2010]. This can be provided as a single value or an array. In case it is provided as an array please make sure that the lenght of the array matches the lenght of the sub-diagonal you want to populate. - Default is 0.2. + Default is 0.1. - `fb_shift`: How far the backward connection will be from the diagonal. Default is 2. - `return_sparse`: flag for returning a `sparse` matrix. @@ -1163,24 +1118,20 @@ Creates a matrix with backward connections as described in [^rodan2010]. ```jldoctest julia> res_matrix = delay_line_backward(5, 5) 5×5 Matrix{Float32}: - 0.0 0.2 0.0 0.0 0.0 - 0.1 0.0 0.2 0.0 0.0 - 0.0 0.1 0.0 0.2 0.0 - 0.0 0.0 0.1 0.0 0.2 + 0.0 0.1 0.0 0.0 0.0 + 0.1 0.0 0.1 0.0 0.0 + 0.0 0.1 0.0 0.1 0.0 + 0.0 0.0 0.1 0.0 0.1 0.0 0.0 0.0 0.1 0.0 julia> res_matrix = delay_line_backward(Float16, 5, 5) 5×5 Matrix{Float16}: - 0.0 0.2 0.0 0.0 0.0 - 0.1 0.0 0.2 0.0 0.0 - 0.0 0.1 0.0 0.2 0.0 - 0.0 0.0 0.1 0.0 0.2 + 0.0 0.1 0.0 0.0 0.0 + 0.1 0.0 0.1 0.0 0.0 + 0.0 0.1 0.0 0.1 0.0 + 0.0 0.0 0.1 0.0 0.1 0.0 0.0 0.0 0.1 0.0 ``` - -[^rodan2010]: Rodan, Ali, and Peter Tino. - "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. """ function delay_line_backward(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Union{Number, AbstractVector} = T(0.1), @@ -1201,7 +1152,7 @@ end cycle_weight=0.1, jump_weight=0.1, jump_size=3, return_sparse=false, cycle_kwargs=(), jump_kwargs=()) -Create a cycle jumps reservoir [^Rodan2012]. +Create a cycle jumps reservoir [Rodan2012](@cite). # Arguments @@ -1266,10 +1217,6 @@ julia> res_matrix = cycle_jumps(5, 5; jump_size = 2) 0.0 0.0 0.1 0.0 0.0 0.0 0.0 0.1 0.1 0.0 ``` - -[^rodan2012]: Rodan, Ali, and Peter Tiňo. - "Simple deterministically constructed cycle reservoirs with regular jumps." - Neural computation 24.7 (2012): 1822-1852. """ function cycle_jumps(rng::AbstractRNG, ::Type{T}, dims::Integer...; cycle_weight::Union{Number, AbstractVector} = T(0.1), @@ -1291,7 +1238,7 @@ end weight=0.1, return_sparse=false, kwargs...) -Create a simple cycle reservoir [^rodan2010]. +Create a simple cycle reservoir [Rodan2011](@cite). 
# Arguments @@ -1344,10 +1291,6 @@ julia> res_matrix = simple_cycle(5, 5; weight = 11) 0.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 11.0 0.0 ``` - -[^rodan2010]: Rodan, Ali, and Peter Tino. - "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. """ function simple_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Union{Number, AbstractVector} = T(0.1), @@ -1364,7 +1307,7 @@ end cycle_weight=0.1, second_cycle_weight=0.1, return_sparse=false) -Creates a double cycle reservoir [^fu2023]. +Creates a double cycle reservoir [Fu2023](@cite). # Arguments @@ -1393,10 +1336,6 @@ julia> reservoir_matrix = double_cycle(5, 5; cycle_weight = 0.1, second_cycle_we 0.0 0.0 0.1 0.0 0.3 0.1 0.0 0.0 0.1 0.0 ``` - -[^fu2023]: Fu, Jun, et al. - "A double-cycle echo state network topology for time series prediction." - Chaos: An Interdisciplinary Journal of Nonlinear Science 33.9 (2023). """ function double_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; cycle_weight::Union{Number, AbstractVector} = T(0.1), @@ -1423,8 +1362,8 @@ end cycle_weight=0.1, second_cycle_weight=0.1, return_sparse=false) -Creates a true double cycle reservoir, ispired by [^fu2023], -with cycles built on the definition by [^rodan2010]. +Creates a true double cycle reservoir, ispired by [Fu2023](@cite), +with cycles built on the definition by [Rodan2011](@cite). # Arguments @@ -1472,13 +1411,6 @@ julia> true_double_cycle(5, 5; cycle_weight = 0.1, second_cycle_weight = 0.3) 0.0 0.0 0.1 0.0 0.3 0.3 0.0 0.0 0.1 0.0 ``` - -[^fu2023]: Fu, Jun, et al. - "A double-cycle echo state network topology for time series prediction." - Chaos: An Interdisciplinary Journal of Nonlinear Science 33.9 (2023). -[^rodan2010]: Rodan, Ali, and Peter Tino. - "Minimum complexity echo state network." - IEEE transactions on neural networks 22.1 (2010): 131-144. """ function true_double_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; cycle_weight::Union{Number, AbstractVector} = T(0.1), @@ -1499,7 +1431,8 @@ end cycle_weight=0.1, selfloop_weight=0.1, return_sparse=false, kwargs...) -Creates a simple cycle reservoir with the addition of self loops [^elsarraj2019]. +Creates a simple cycle reservoir with the +addition of self loops [Elsarraj2019](@cite). This architecture is referred to as TP1 in the original paper. @@ -1571,10 +1504,6 @@ julia> reservoir_matrix = selfloop_cycle(5, 5; weight=0.2, selfloop_weight=0.5) 0.0 0.0 0.2 0.5 0.0 0.0 0.0 0.0 0.2 0.5 ``` - -[^elsarraj2019]: Elsarraj, Duaa, et al. - "Demystifying echo state network with deterministic simple topologies." - International Journal of Computational Science and Engineering 19.3 (2019): 407-417. """ function selfloop_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; cycle_weight::Union{Number, AbstractVector} = T(0.1f0), @@ -1594,7 +1523,7 @@ end return_sparse=false) Creates a cycle reservoir with feedback connections on even neurons and -self loops on odd neurons [^elsarraj2019]. +self loops on odd neurons [Elsarraj2019](@cite). This architecture is referred to as TP2 in the original paper. @@ -1649,10 +1578,6 @@ julia> reservoir_matrix = selfloop_feedback_cycle(5, 5; self_loop_weight=0.5) 0.0 0.0 0.1 0.0 0.0 0.0 0.0 0.0 0.1 0.5 ``` - -[^elsarraj2019]: Elsarraj, Duaa, et al. - "Demystifying echo state network with deterministic simple topologies." - International Journal of Computational Science and Engineering 19.3 (2019): 407-417. 
""" function selfloop_feedback_cycle(rng::AbstractRNG, ::Type{T}, dims::Integer...; cycle_weight::Union{Number, AbstractVector} = T(0.1f0), @@ -1682,7 +1607,7 @@ end selfloop_kwargs=(), delay_kwargs=()) Creates a reservoir based on a delay line with the addition of self loops and -backward connections shifted by one [^elsarraj2019]. +backward connections shifted by one [Elsarraj2019](@cite). This architecture is referred to as TP3 in the original paper. @@ -1763,10 +1688,6 @@ julia> reservoir_matrix = selfloop_delayline_backward(5, 5; weight=0.3) 0.0 0.0 0.3 0.1 0.0 0.0 0.0 0.0 0.3 0.1 ``` - -[^elsarraj2019]: Elsarraj, Duaa, et al. - "Demystifying echo state network with deterministic simple topologies." - International Journal of Computational Science and Engineering 19.3 (2019): 407-417. """ function selfloop_delayline_backward(rng::AbstractRNG, ::Type{T}, dims::Integer...; shift::Integer = 1, fb_shift::Integer = 2, @@ -1792,7 +1713,7 @@ end delay_kwargs=()) Creates a reservoir based on a forward connection of weights between even nodes -with the addition of self loops [^elsarraj2019]. +with the addition of self loops [Elsarraj2019](@cite). This architecture is referred to as TP4 in the original paper. @@ -1865,10 +1786,6 @@ julia> reservoir_matrix = selfloop_forward_connection(5, 5; weight=0.5) 0.0 0.5 0.0 0.1 0.0 0.0 0.0 0.5 0.0 0.1 ``` - -[^elsarraj2019]: Elsarraj, Duaa, et al. - "Demystifying echo state network with deterministic simple topologies." - International Journal of Computational Science and Engineering 19.3 (2019): 407-417. """ function selfloop_forward_connection(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Union{Number, AbstractVector} = T(0.1f0), @@ -1888,7 +1805,7 @@ end weight=0.1, selfloop_weight=0.1, return_sparse=false) -Creates a reservoir based on a forward connection of weights [^elsarraj2019]. +Creates a reservoir based on a forward connection of weights [Elsarraj2019](@cite). This architecture is referred to as TP5 in the original paper. @@ -1953,10 +1870,6 @@ julia> reservoir_matrix = forward_connection(5, 5; weight=0.5) 0.0 0.5 0.0 0.0 0.0 0.0 0.0 0.5 0.0 0.0 ``` - -[^elsarraj2019]: Elsarraj, Duaa, et al. - "Demystifying echo state network with deterministic simple topologies." - International Journal of Computational Science and Engineering 19.3 (2019): 407-417. """ function forward_connection(rng::AbstractRNG, ::Type{T}, dims::Integer...; weight::Union{Number, AbstractVector} = T(0.1f0), return_sparse::Bool = false, diff --git a/src/esn/esn_reservoir_drivers.jl b/src/esn/esn_reservoir_drivers.jl index f16e849a..35e216a0 100644 --- a/src/esn/esn_reservoir_drivers.jl +++ b/src/esn/esn_reservoir_drivers.jl @@ -150,7 +150,7 @@ end scaling_factor=fill(leaky_coefficient, length(activation_function))) Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN), -introduced in [^Lun2015]. +introduced in [Lun2015](@cite). # Arguments @@ -173,10 +173,6 @@ introduced in [^Lun2015]. This function creates an MRNN object with the specified activation functions, leaky coefficient, and scaling factors, which can be used as a reservoir driver in the ESN. - -[^Lun2015]: Lun, Shu-Xian, et al. - "_A novel model of leaky integrator echo state network for - time-series prediction._" Neurocomputing 159 (2015): 58-66. """ function MRNN(; activation_function = [tanh, sigmoid], leaky_coefficient = 1.0, @@ -222,24 +218,15 @@ end Returns a Fully Gated Recurrent Unit (FullyGated) initializer for the Echo State Network (ESN). 
-Returns the standard gated recurrent unit [^Cho2014] as a driver for the +Returns the standard gated recurrent unit [Cho2014](@cite) as a driver for the echo state network (`ESN`). - -[^Cho2014]: Cho, Kyunghyun, et al. - "_Learning phrase representations using RNN encoder-decoder - for statistical machine translation._" - arXiv preprint arXiv:1406.1078 (2014). """ struct FullyGated <: AbstractGRUVariant end """ Minimal() -Returns a minimal GRU ESN initializer as described in [^Zhou2016]. - -[^Zhou2016]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent - neural networks._" - International Journal of Automation and Computing 13.3 (2016): 226-234. +Returns a minimal GRU ESN initializer. """ struct Minimal <: AbstractGRUVariant end @@ -252,7 +239,7 @@ struct Minimal <: AbstractGRUVariant end variant = FullyGated()) Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Network (`ESN`). -This driver is based on the GRU architecture [^Cho2014]. +This driver is based on the GRU architecture [Cho2014](@cite). # Arguments @@ -267,10 +254,6 @@ This driver is based on the GRU architecture [^Cho2014]. By default, it uses two dense layers. - `variant`: The GRU variant to use. By default, it uses the "FullyGated" variant. - -[^Cho2014]: Cho, Kyunghyun, et al. - "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._" - arXiv preprint arXiv:1406.1078 (2014). """ function GRU(; activation_function = [sigmoid, sigmoid, tanh], inner_layer = fill(scaled_rand, 2), diff --git a/src/states.jl b/src/states.jl index a0e85123..177dd550 100644 --- a/src/states.jl +++ b/src/states.jl @@ -372,7 +372,7 @@ struct NLADefault <: NonLinearAlgorithm end NLAT1() `NLAT1` implements the T₁ transformation algorithm introduced -in [^Chattopadhyay] and [^Pathak]. The T₁ algorithm squares +in [Chattopadhyay2020](@cite) and [Pathak2017](@cite). The T₁ algorithm squares elements of the input array, targeting every second row. @@ -442,16 +442,6 @@ julia> mat_new = nlat(mat_old) 361 400 441 ``` - -[^Chattopadhyay]: Chattopadhyay, Ashesh, et al. - "Data-driven prediction of a multi-scale Lorenz 96 chaotic system using a - hierarchy of deep learning methods: Reservoir computing, ANN, and RNN-LSTM." - (2019). - -[^Pathak]: Pathak, Jaideep, et al. - "Model-free prediction of large spatiotemporally chaotic systems - from data: A reservoir computing approach." - Physical review letters 120.2 (2018): 024102. """ struct NLAT1 <: NonLinearAlgorithm end @@ -471,7 +461,7 @@ end NLAT2() `NLAT2` implements the T₂ transformation algorithm as defined -in [^Chattopadhyay]. This transformation algorithm modifies the +in [Chattopadhyay2020](@cite). This transformation algorithm modifies the reservoir states by multiplying each odd-indexed row (starting from the second row) with the product of its two preceding rows. @@ -541,11 +531,6 @@ julia> mat_new = nlat(mat_old) 19 20 21 ``` - -[^Chattopadhyay]: Chattopadhyay, Ashesh, et al. - "Data-driven prediction of a multi-scale Lorenz 96 chaotic system using a - hierarchy of deep learning methods: Reservoir computing, ANN, and RNN-LSTM." - (2019). """ struct NLAT2 <: NonLinearAlgorithm end @@ -565,7 +550,7 @@ end NLAT3() Implements the T₃ transformation algorithm as detailed -in [^Chattopadhyay]. This algorithm modifies the reservoir's states by +in [Chattopadhyay2020](@cite). 
This algorithm modifies the reservoir's states by multiplying each odd-indexed row (beginning from the second row) with the product of the immediately preceding and the immediately following rows. @@ -635,11 +620,6 @@ julia> mat_new = nlat(mat_old) 19 20 21 ``` - -[^Chattopadhyay]: Chattopadhyay, Ashesh, et al. - "Data-driven predictions of a multiscale Lorenz 96 chaotic system using - machine-learning methods: reservoir computing, artificial neural network, - and long short-term memory network." (2019). """ struct NLAT3 <: NonLinearAlgorithm end @@ -658,7 +638,7 @@ end @doc raw""" PartialSquare(eta) -Implement a partial squaring of the states as described in [^barbosa2021]. +Implement a partial squaring of the states as described in [Barbosa2021](@cite). # Equations @@ -704,11 +684,7 @@ julia> x_new = ps(x_old) 7 8 9 - - -[^barbosa2021]: Barbosa, Wendson AS, et al. - "Symmetry-aware reservoir computing." - Physical Review E 104.4 (2021): 045307. +``` """ struct PartialSquare <: NonLinearAlgorithm eta::Number @@ -731,7 +707,7 @@ end ExtendedSquare() -Extension of the Lu initialization proposed in [^herteux2020]. +Extension of the Lu initialization proposed in [Herteux2020](@cite). The state vector is extended with the squared elements of the initial state @@ -783,10 +759,6 @@ julia> x_new = es(x_old) 81 ``` - -[^herteux2020]: Herteux, Joschka, and Christoph Räth. - "Breaking symmetries of the reservoir equations in echo state networks." - Chaos: An Interdisciplinary Journal of Nonlinear Science 30.12 (2020). """ struct ExtendedSquare <: NonLinearAlgorithm end
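`PartialSquare` and `ExtendedSquare`, documented at the end of `src/states.jl`, are likewise plain callable structs, so they can be tried on a vector before being wired into a model. A minimal sketch, assuming `eta` in `PartialSquare` is the fraction of leading state entries that get squared while the rest pass through unchanged (consistent with the doctest output shown above):

```julia
using ReservoirComputing

x = collect(1.0:10.0)

# PartialSquare: square only the leading fraction `eta` of the state vector
ps = PartialSquare(0.6)
x_ps = ps(x)       # assumed: first 6 entries squared, last 4 unchanged

# ExtendedSquare: append the element-wise square to the original state,
# doubling the state dimension
es = ExtendedSquare()
x_es = es(x)       # length 20: x followed by x.^2
```

Since both subtype `NonLinearAlgorithm`, they can presumably be passed to `ESN` through the `nla_type` keyword in the same way as `NLAT2()` in the README example.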