@@ -62,10 +62,14 @@ def __init__(
6262 providing a single summary of the time series which then attends to each point in the time series pro-
6363 cessed via a series of ``num_attention_blocks`` self-attention layers.
6464
65- Important: Assumes that positional encodings have been appended to the input time series.
65+ Important: Assumes that positional encodings have been appended to the input time series, e.g.,
66+ through a custom configurator.
6667
67- Recommnded: When using transformers as summary networks, you may want to use a smaller learning rate
68- during training, e.g., setting ``default_lr=1e-5`` in a ``Trainer`` instance.
68+ Recommended: When using transformers as summary networks, you may want to use a smaller learning rate
69+ during training, e.g., setting ``default_lr=5e-5`` in a ``Trainer`` instance.
70+
71+ Layer normalization (controllable through the ``use_layer_norm`` keyword argument) may not always work
72+ well in certain applications. Consider setting it to ``False`` if the network is underperforming.
6973
7074 Parameters
7175 ----------
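
To make the note about appended positional encodings concrete, here is a minimal sketch of a
configurator-style helper that appends a normalized time index as an extra channel before the
series reaches the summary network. The helper name, array shapes, and the linear [0, 1]
encoding are illustrative assumptions only, not part of this diff; sinusoidal encodings or a
library-provided configurator would work just as well.

    import numpy as np

    def append_positional_encoding(x):
        """Appends a normalized time index as an extra channel.

        Assumes ``x`` has shape (batch_size, num_time_points, num_channels);
        the returned array has one extra channel with values in [0, 1]
        marking each point's position in the series.
        """
        batch_size, num_time_points, _ = x.shape
        positions = np.linspace(0.0, 1.0, num_time_points, dtype=x.dtype)
        positions = np.tile(positions[None, :, None], (batch_size, 1, 1))
        return np.concatenate([x, positions], axis=-1)

    # A batch of 32 series with 100 time points and 2 observed channels
    x = np.random.default_rng(42).normal(size=(32, 100, 2)).astype(np.float32)
    assert append_positional_encoding(x).shape == (32, 100, 3)
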
@@ -133,8 +137,12 @@ def __init__(
133137
134138 # Construct final attention layer, which will perform cross-attention
135139 # between the outputs of the self-attention layers and the dynamic template
140+ if bidirectional:
141+     final_input_dim = template_dim * 2
142+ else:
143+     final_input_dim = template_dim
136144 self.output_attention = MultiHeadAttentionBlock(
137- template_dim, attention_settings, num_dense_fc, dense_settings, use_layer_norm
145+ final_input_dim, attention_settings, num_dense_fc, dense_settings, use_layer_norm
138146 )
139147
140148 # A recurrent network will learn the dynamic many-to-one template
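
The new ``final_input_dim = template_dim * 2`` branch matches how a bidirectional recurrent
layer with the usual concatenating merge doubles its output features: the forward and backward
summaries are stacked along the feature axis. The snippet below illustrates this with a Keras
``Bidirectional(LSTM(...))`` wrapper; treating the template network as an LSTM is an assumption
for illustration, not a claim about the repository's actual implementation.

    import numpy as np
    import tensorflow as tf

    template_dim = 64
    x = np.random.normal(size=(8, 100, 3)).astype(np.float32)  # (batch, time, channels)

    # A unidirectional LSTM summarizes each series into ``template_dim`` features.
    uni = tf.keras.layers.LSTM(template_dim)
    print(uni(x).shape)  # (8, 64)

    # Wrapping it in Bidirectional (default merge_mode="concat") concatenates the
    # forward and backward summaries, so the template has 2 * template_dim features,
    # which is why the cross-attention block must expect template_dim * 2 inputs.
    bi = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(template_dim))
    print(bi(x).shape)   # (8, 128)
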