
Commit 557a08d

Improve transformer doc [skip ci]
1 parent bc4fac9 commit 557a08d

File tree

  • bayesflow/networks/transformers

1 file changed: +8 −0 lines changed

bayesflow/networks/transformers/mab.py

Lines changed: 8 additions & 0 deletions
@@ -10,9 +10,17 @@
 class MultiHeadAttentionBlock(keras.Layer):
     """Implements the MAB block from [1] which represents learnable cross-attention.
 
+    In particular, it uses a so-called "Post-LN" transformer block [2] which applies
+    layer norm following attention and following MLP. A "Pre-LN" transformer block
+    can easily be implemented.
+
     [1] Lee, J., Lee, Y., Kim, J., Kosiorek, A., Choi, S., & Teh, Y. W. (2019).
     Set transformer: A framework for attention-based permutation-invariant neural networks.
     In International conference on machine learning (pp. 3744-3753). PMLR.
+
+    [2] Xiong, R., Yang, Y., He, D., Zheng, K., Zheng, S., Xing, C., ... & Liu, T. (2020, November).
+    On layer normalization in the transformer architecture.
+    In International conference on machine learning (pp. 10524-10533). PMLR.
     """
 
     def __init__(
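
Aside: a minimal sketch of the distinction the new docstring draws, assuming Keras 3. This is not the actual MultiHeadAttentionBlock implementation; the names (att, mlp, ln1, ln2, post_ln, pre_ln) and the dimension are illustrative. "Post-LN" [2] applies layer norm after each residual sum, while "Pre-LN" applies it to each sublayer's input.

import keras

dim = 16  # illustrative feature dimension
att = keras.layers.MultiHeadAttention(num_heads=4, key_dim=dim)
mlp = keras.Sequential(
    [keras.layers.Dense(2 * dim, activation="gelu"), keras.layers.Dense(dim)]
)
ln1 = keras.layers.LayerNormalization()
ln2 = keras.layers.LayerNormalization()

def post_ln(q, kv):
    # Post-LN: layer norm follows the attention residual, then the MLP residual.
    h = ln1(q + att(q, kv))
    return ln2(h + mlp(h))

def pre_ln(q, kv):
    # Pre-LN: layer norm is applied to each sublayer's input;
    # the residual path itself stays unnormalized.
    h = q + att(ln1(q), ln1(kv))
    return h + mlp(ln2(h))

Pre-LN stacks typically add one final layer norm after the last block; that detail (and the choice of separate norms for query and key/value inputs) is omitted from this sketch.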
