class AttentionLayer(nn.Module):
    """
    Multi-head attention wrapper.

    Projects queries/keys/values into `n_heads` sub-spaces, delegates the scaled
    attention to `attention` (e.g. `FullAttention`), and merges the heads back
    with a final linear projection.

    **Parameters:**<br>
    `attention`: nn.Module, inner attention with signature
        (queries, keys, values, attn_mask, tau, delta) -> (values, attn).<br>
    `hidden_size`: int, model dimension of inputs and outputs.<br>
    `n_heads`: int, number of attention heads.<br>
    `d_keys`: int, optional, per-head query/key dim (default hidden_size // n_heads).<br>
    `d_values`: int, optional, per-head value dim (default hidden_size // n_heads).<br>
    """

    def __init__(self, attention, hidden_size, n_heads, d_keys=None,
                 d_values=None):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (hidden_size // n_heads)
        d_values = d_values or (hidden_size // n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(hidden_size, d_keys * n_heads)
        self.key_projection = nn.Linear(hidden_size, d_keys * n_heads)
        self.value_projection = nn.Linear(hidden_size, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, hidden_size)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        # queries: [B, L, hidden]; keys/values: [B, S, hidden]
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        # Split projections into H heads: [B, L|S, H, d]
        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        out, attn = self.inner_attention(
            queries=queries,
            keys=keys,
            values=values,
            attn_mask=attn_mask,
            tau=tau,
            delta=delta
        )
        # Merge heads back to [B, L, H * d_values] and project to hidden_size.
        out = out.view(B, L, -1)

        return self.out_projection(out), attn


class TriangularCausalMask():
    """
    Boolean upper-triangular mask of shape [B, 1, L, L]; True entries mark
    future positions causal attention must not attend to.
    """
    def __init__(self, B, L, device="cpu"):
        mask_shape = [B, 1, L, L]
        with torch.no_grad():
            # Build the mask directly on the target device instead of
            # allocating on CPU and copying with .to(device) — avoids a
            # host->device transfer on every forward pass.
            self._mask = torch.triu(
                torch.ones(mask_shape, dtype=torch.bool, device=device), diagonal=1
            )

    @property
    def mask(self):
        return self._mask


class FullAttention(nn.Module):
    """
    Canonical scaled dot-product attention.

    **Parameters:**<br>
    `mask_flag`: bool, apply a causal mask when True.<br>
    `factor`: int, unused; kept for interface parity with sparse attentions.<br>
    `scale`: float, optional, score scaling (default 1/sqrt(E)).<br>
    `attention_dropout`: float, dropout rate on attention weights.<br>
    `output_attention`: bool, also return the attention matrix when True.<br>
    """
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(FullAttention, self).__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        # queries: [B, L, H, E]; values: [B, S, H, D]. tau/delta are accepted
        # for interface compatibility and unused here.
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        scale = self.scale or 1. / math.sqrt(E)

        scores = torch.einsum("blhe,bshe->bhls", queries, keys)

        if self.mask_flag:
            if attn_mask is None:
                attn_mask = TriangularCausalMask(B, L, device=queries.device)

            # float("-inf") == -np.inf; avoids a numpy dependency in this
            # otherwise pure-torch module. Softmax maps masked scores to 0.
            scores.masked_fill_(attn_mask.mask, float("-inf"))

        A = self.dropout(torch.softmax(scale * scores, dim=-1))
        V = torch.einsum("bhls,bshd->blhd", A, values)

        if self.output_attention:
            return V.contiguous(), A
        else:
            return V.contiguous(), None


class DataEmbedding_inverted(nn.Module):
    """
    Inverted embedding (iTransformer-style): embeds each variate's whole time
    series into a single token of size `hidden_size`.
    """
    def __init__(self, c_in, hidden_size, dropout=0.1):
        super(DataEmbedding_inverted, self).__init__()
        self.value_embedding = nn.Linear(c_in, hidden_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        x = x.permute(0, 2, 1)
        # x: [Batch Variate Time]
        if x_mark is None:
            x = self.value_embedding(x)
        else:
            # the potential to take covariates (e.g. timestamps) as tokens
            x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1))
        # x: [Batch Variate hidden_size]
        return self.dropout(x)
timestamps) as tokens\n", + " x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) \n", + " # x: [Batch Variate hidden_size]\n", " return self.dropout(x)" ] }, diff --git a/nbs/core.ipynb b/nbs/core.ipynb index 5138b5e68..6e663ded2 100644 --- a/nbs/core.ipynb +++ b/nbs/core.ipynb @@ -93,7 +93,7 @@ " StemGNN, PatchTST, TimesNet, TimeLLM, TSMixer, TSMixerx,\n", " MLPMultivariate, iTransformer,\n", " BiTCN, TiDE, DeepNPTS, SOFTS,\n", - " TimeMixer, KAN, RMoK\n", + " TimeMixer, KAN, RMoK, TimeXer\n", ")\n", "from neuralforecast.common._base_auto import BaseAuto, MockTrial\n", "from neuralforecast.utils import PredictionIntervals, get_prediction_interval_method" @@ -247,7 +247,8 @@ " 'softs': SOFTS, 'autosofts': SOFTS,\n", " 'timemixer': TimeMixer, 'autotimemixer': TimeMixer,\n", " 'kan': KAN, 'autokan': KAN,\n", - " 'rmok': RMoK, 'autormok': RMoK\n", + " 'rmok': RMoK, 'autormok': RMoK,\n", + " 'timexer': TimeXer, 'autotimexer': TimeXer\n", "}" ] }, diff --git a/nbs/docs/capabilities/01_overview.ipynb b/nbs/docs/capabilities/01_overview.ipynb index 11b964a7f..7c968c338 100644 --- a/nbs/docs/capabilities/01_overview.ipynb +++ b/nbs/docs/capabilities/01_overview.ipynb @@ -43,7 +43,8 @@ "|`TiDE` | `AutoTiDE` | MLP | Univariate | Direct | F/H/S | \n", "|`TimeMixer` | `AutoTimeMixer` | MLP | Multivariate | Direct | - | \n", "|`TimeLLM` | - | LLM | Univariate | Direct | - | \n", - "|`TimesNet` | `AutoTimesNet` | CNN | Univariate | Direct | F | \n", + "|`TimesNet` | `AutoTimesNet` | CNN | Univariate | Direct | F |\n", + "|`TimeXer` | `AutoTimeXer` | Transformer | Multivariate | Direct | F | \n", "|`TSMixer` | `AutoTSMixer` | MLP | Multivariate | Direct | - | \n", "|`TSMixerx` | `AutoTSMixerx` | MLP | Multivariate | Direct | F/H/S | \n", "|`VanillaTransformer` | `AutoVanillaTransformer` | Transformer | Univariate | Direct | F | \n", diff --git a/nbs/imgs_models/timexer.png b/nbs/imgs_models/timexer.png new file mode 100644 index 000000000..99a6e22ab Binary files 
class AutoTimeXer(BaseAuto):
    """Hyperparameter-search wrapper around `TimeXer` (ray / optuna backends)."""

    default_config = {
        "input_size_multiplier": [1, 2, 3, 4, 5],
        "h": None,
        "n_series": None,
        "hidden_size": tune.choice([128, 256, 512]),
        "n_heads": tune.choice([4, 8]),
        "learning_rate": tune.loguniform(1e-4, 1e-1),
        "scaler_type": tune.choice([None, 'robust', 'standard']),
        "max_steps": tune.choice([500, 1000, 2000]),
        "batch_size": tune.choice([32, 64, 128, 256]),
        "loss": None,
        "random_seed": tune.randint(1, 20),
    }

    def __init__(self,
                 h,
                 n_series,
                 loss=MAE(),
                 valid_loss=None,
                 config=None,
                 search_alg=BasicVariantGenerator(random_state=1),
                 num_samples=10,
                 refit_with_val=False,
                 cpus=cpu_count(),
                 gpus=torch.cuda.device_count(),
                 verbose=False,
                 alias=None,
                 backend='ray',
                 callbacks=None):

        # Fall back to the default search space when none was provided.
        if config is None:
            config = self.get_default_config(h=h, backend=backend, n_series=n_series)

        # n_series must always match the constructor argument: with ray we can
        # simply overwrite it, with optuna we can only validate a sampled config.
        if backend == 'ray':
            config['n_series'] = n_series
        elif backend == 'optuna':
            sampled = config(MockTrial())
            if 'n_series' not in sampled or sampled['n_series'] != n_series:
                raise Exception(f"config needs 'n_series': {n_series}")

        super(AutoTimeXer, self).__init__(
            cls_model=TimeXer,
            h=h,
            loss=loss,
            valid_loss=valid_loss,
            config=config,
            search_alg=search_alg,
            num_samples=num_samples,
            refit_with_val=refit_with_val,
            cpus=cpus,
            gpus=gpus,
            verbose=verbose,
            alias=alias,
            backend=backend,
            callbacks=callbacks,
        )

    @classmethod
    def get_default_config(cls, h, backend, n_series):
        """Build the default hyperparameter search space for horizon `h`."""
        config = cls.default_config.copy()

        # Turn the horizon multipliers into concrete candidate input sizes.
        multipliers = config.pop("input_size_multiplier")
        config['input_size'] = tune.choice([h * m for m in multipliers])

        # Rolling windows with step_size=1 or step_size=h
        # See `BaseWindows` and `BaseRNN`'s create_windows
        config['step_size'] = tune.choice([1, h])

        if backend == 'optuna':
            # Always use n_series from parameters
            config['n_series'] = n_series
            config = cls._ray_config_to_optuna(config)

        return config
= 1\n", + "my_config['val_check_steps'] = 1\n", + "my_config['input_size'] = 12\n", + "my_config['patch_len'] = 12\n", + "model = AutoTimeXer(h=12, n_series=1, config=my_config, backend='ray', num_samples=1, cpus=1)\n", + "model.fit(dataset=dataset)" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/nbs/models.itransformer.ipynb b/nbs/models.itransformer.ipynb index e8b6f15d7..c54e2a98e 100644 --- a/nbs/models.itransformer.ipynb +++ b/nbs/models.itransformer.ipynb @@ -66,125 +66,23 @@ "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", - "import numpy as np\n", - "\n", - "from math import sqrt\n", - "\n", "from neuralforecast.losses.pytorch import MAE\n", "from neuralforecast.common._base_multivariate import BaseMultivariate\n", "\n", - "from neuralforecast.common._modules import TransEncoder, TransEncoderLayer, AttentionLayer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Auxiliary functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.1 Attention" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "\n", - "class TriangularCausalMask():\n", - " \"\"\"\n", - " TriangularCausalMask\n", - " \"\"\" \n", - " def __init__(self, B, L, device=\"cpu\"):\n", - " mask_shape = [B, 1, L, L]\n", - " with torch.no_grad():\n", - " self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)\n", - "\n", - " @property\n", - " def mask(self):\n", - " return self._mask\n", - "\n", - "class FullAttention(nn.Module):\n", - " \"\"\"\n", - " FullAttention\n", - " \"\"\" \n", - " def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):\n", - " super(FullAttention, self).__init__()\n", - " self.scale = scale\n", - " self.mask_flag = mask_flag\n", - " self.output_attention = output_attention\n", - " self.dropout = 
nn.Dropout(attention_dropout)\n", - "\n", - " def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):\n", - " B, L, H, E = queries.shape\n", - " _, S, _, D = values.shape\n", - " scale = self.scale or 1. / sqrt(E)\n", - "\n", - " scores = torch.einsum(\"blhe,bshe->bhls\", queries, keys)\n", - "\n", - " if self.mask_flag:\n", - " if attn_mask is None:\n", - " attn_mask = TriangularCausalMask(B, L, device=queries.device)\n", - "\n", - " scores.masked_fill_(attn_mask.mask, -np.inf)\n", - "\n", - " A = self.dropout(torch.softmax(scale * scores, dim=-1))\n", - " V = torch.einsum(\"bhls,bshd->blhd\", A, values)\n", - "\n", - " if self.output_attention:\n", - " return (V.contiguous(), A)\n", - " else:\n", - " return (V.contiguous(), None) " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.2 Inverted embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "\n", - "class DataEmbedding_inverted(nn.Module):\n", - " \"\"\"\n", + "from neuralforecast.common._modules import (\n", + " TransEncoder, \n", + " TransEncoderLayer, \n", + " AttentionLayer, \n", + " FullAttention, \n", " DataEmbedding_inverted\n", - " \"\"\" \n", - " def __init__(self, c_in, hidden_size, dropout=0.1):\n", - " super(DataEmbedding_inverted, self).__init__()\n", - " self.value_embedding = nn.Linear(c_in, hidden_size)\n", - " self.dropout = nn.Dropout(p=dropout)\n", - "\n", - " def forward(self, x, x_mark):\n", - " x = x.permute(0, 2, 1)\n", - " # x: [Batch Variate Time]\n", - " if x_mark is None:\n", - " x = self.value_embedding(x)\n", - " else:\n", - " # the potential to take covariates (e.g. timestamps) as tokens\n", - " x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) \n", - " # x: [Batch Variate hidden_size]\n", - " return self.dropout(x)" + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# 2. Model" + "# 1. 
Model" ] }, { @@ -410,7 +308,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 3. Usage example" + "# 2. Usage example" ] }, { diff --git a/nbs/models.timexer.ipynb b/nbs/models.timexer.ipynb new file mode 100644 index 000000000..44565dce5 --- /dev/null +++ b/nbs/models.timexer.ipynb @@ -0,0 +1,528 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp models.timexer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from nbdev.showdoc import show_doc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TimeXer\n", + "\n", + "TimeXer empowers the canonical Transformer with the ability to reconcile endogenous and exogenous information, where patch-wise self-attention and variate-wise cross-attention are used simultaneously.\n", + "\n", + "**References**\n", + "- [Yuxuan Wang, Haixu Wu, Jiaxiang Dong, Guo Qin, Haoran Zhang, Yong Liu, Yunzhong Qiu, Jianmin Wang, Mingsheng Long. \"TimeXer: Empowering Transformers for Time Series Forecasting with Exogenous Variables\"](https://arxiv.org/abs/2402.19072)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Figure 1. 
class FlattenHead(nn.Module):
    """
    Forecasting head: flattens the trailing (d_model, patch_num) dimensions of
    each variate and maps them linearly onto the target window.
    """
    def __init__(self, n_vars, nf, target_window, head_dropout=0):
        super().__init__()
        self.n_vars = n_vars
        self.flatten = nn.Flatten(start_dim=-2)
        self.linear = nn.Linear(nf, target_window)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        # x: [bs x nvars x d_model x patch_num] -> [bs x nvars x target_window]
        return self.dropout(self.linear(self.flatten(x)))


class Encoder(nn.Module):
    """Stack of encoder layers with optional final normalization and projection."""
    def __init__(self, layers, norm_layer=None, projection=None):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer
        self.projection = projection

    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
        for encoder_layer in self.layers:
            x = encoder_layer(x, cross, x_mask=x_mask, cross_mask=cross_mask,
                              tau=tau, delta=delta)
        if self.norm is not None:
            x = self.norm(x)
        if self.projection is not None:
            x = self.projection(x)
        return x


class EncoderLayer(nn.Module):
    """
    TimeXer encoder layer: patch-wise self-attention over the endogenous
    tokens, variate-wise cross-attention from the global token to the
    exogenous embeddings, then a position-wise conv feed-forward block.
    """
    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
        batch, _, dim = cross.shape

        # Patch-wise self-attention; delta is deliberately not forwarded here.
        self_out = self.self_attention(x, x, x, attn_mask=x_mask,
                                       tau=tau, delta=None)[0]
        x = self.norm1(x + self.dropout(self_out))

        # The last token of every (batch*variate) row is the global token.
        glb_orig = x[:, -1, :].unsqueeze(1)
        glb = torch.reshape(glb_orig, (batch, -1, dim))

        # Variate-wise cross-attention: global token attends to exogenous tokens.
        cross_out = self.dropout(self.cross_attention(glb, cross, cross,
                                                      attn_mask=cross_mask,
                                                      tau=tau, delta=delta)[0])
        cross_out = torch.reshape(
            cross_out,
            (cross_out.shape[0] * cross_out.shape[1], cross_out.shape[2])
        ).unsqueeze(1)
        glb = self.norm2(glb_orig + cross_out)

        # Re-attach the updated global token, then conv feed-forward + residual.
        y = x = torch.cat([x[:, :-1, :], glb], dim=1)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))

        return self.norm3(x + y)
class EnEmbedding(nn.Module):
    """
    Endogenous embedding: splits each series into non-overlapping patches of
    length `patch_len`, embeds each patch linearly plus a positional
    embedding, and appends a learnable per-variate global token.
    """
    def __init__(self, n_vars, d_model, patch_len, dropout):
        super(EnEmbedding, self).__init__()
        # Patching
        self.patch_len = patch_len

        self.value_embedding = nn.Linear(patch_len, d_model, bias=False)
        # One learnable global token per variate, broadcast over the batch.
        self.glb_token = nn.Parameter(torch.randn(1, n_vars, 1, d_model))
        self.position_embedding = PositionalEmbedding(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # do patching; x: [batch, n_vars, time] — any remainder of time not
        # divisible by patch_len is dropped by unfold.
        n_vars = x.shape[1]
        glb = self.glb_token.repeat((x.shape[0], 1, 1, 1))

        x = x.unfold(dimension=-1, size=self.patch_len, step=self.patch_len)
        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
        # Input encoding
        x = self.value_embedding(x) + self.position_embedding(x)
        x = torch.reshape(x, (-1, n_vars, x.shape[-2], x.shape[-1]))
        # Append the global token as the last token of every variate.
        x = torch.cat([x, glb], dim=2)
        # Output: [batch * n_vars, patch_num + 1, d_model]
        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
        return self.dropout(x), n_vars


class TimeXer(BaseMultivariate):
    """TimeXer

    TimeXer empowers the canonical Transformer to reconcile endogenous and
    exogenous information: patch-wise self-attention over the endogenous
    series and variate-wise cross-attention (through a global token) over
    the exogenous series are used simultaneously.

    **Parameters:**<br>
    `h`: int, Forecast horizon.<br>
    `input_size`: int, autorregresive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].<br>
    `n_series`: int, number of time-series.<br>
    `futr_exog_list`: str list, future exogenous columns.<br>
    `hist_exog_list`: str list, historic exogenous columns.<br>
    `stat_exog_list`: str list, static exogenous columns.<br>
    `patch_len`: int, length of patches.<br>
    `hidden_size`: int, dimension of the model.<br>
    `n_heads`: int, number of heads.<br>
    `e_layers`: int, number of encoder layers.<br>
    `d_ff`: int, dimension of fully-connected layer.<br>
    `factor`: int, attention factor.<br>
    `dropout`: float, dropout rate.<br>
    `use_norm`: bool, whether to normalize or not.<br>
    `loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).<br>
    `valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).<br>
    `max_steps`: int=1000, maximum number of training steps.<br>
    `learning_rate`: float=1e-3, Learning rate between (0, 1).<br>
    `num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.<br>
    `early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.<br>
    `val_check_steps`: int=100, Number of training steps between every validation loss check.<br>
    `batch_size`: int=32, number of different series in each batch.<br>
    `step_size`: int=1, step size between each window of temporal data.<br>
    `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).<br>
    `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.<br>
    `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.<br>
    `alias`: str, optional, Custom name of the model.<br>
    `optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).<br>
    `optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.<br>
    `lr_scheduler`: Subclass of 'torch.optim.lr_scheduler.LRScheduler', optional, user specified lr_scheduler instead of the default choice (StepLR).<br>
    `lr_scheduler_kwargs`: dict, optional, list of parameters used by the user specified `lr_scheduler`.<br>
    `dataloader_kwargs`: dict, optional, list of parameters passed into the PyTorch Lightning dataloader by the `TimeSeriesDataLoader`.<br>
    `**trainer_kwargs`: int, keyword trainer arguments inherited from [PyTorch Lighning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).<br>

    **References**
    - [Yuxuan Wang, Haixu Wu, Jiaxiang Dong, Guo Qin, Haoran Zhang, Yong Liu, Yunzhong Qiu, Jianmin Wang, Mingsheng Long. "TimeXer: Empowering Transformers for Time Series Forecasting with Exogenous Variables"](https://arxiv.org/abs/2402.19072)
    """

    # Class attributes
    SAMPLING_TYPE = 'multivariate'
    EXOGENOUS_FUTR = True
    EXOGENOUS_HIST = False
    EXOGENOUS_STAT = False

    def __init__(self,
                 h,
                 input_size,
                 n_series,
                 futr_exog_list = None,
                 hist_exog_list = None,
                 stat_exog_list = None,
                 patch_len: int = 16,
                 hidden_size: int = 512,
                 n_heads: int = 8,
                 e_layers: int = 2,
                 d_ff: int = 2048,
                 factor: int = 1,
                 dropout: float = 0.1,
                 use_norm: bool = True,
                 loss = MAE(),
                 valid_loss = None,
                 max_steps: int = 1000,
                 learning_rate: float = 1e-3,
                 num_lr_decays: int = -1,
                 early_stop_patience_steps: int =-1,
                 val_check_steps: int = 100,
                 batch_size: int = 32,
                 step_size: int = 1,
                 scaler_type: str = 'identity',
                 random_seed: int = 1,
                 drop_last_loader: bool = False,
                 optimizer = None,
                 optimizer_kwargs = None,
                 lr_scheduler = None,
                 lr_scheduler_kwargs = None,
                 dataloader_kwargs = None,
                 **trainer_kwargs):

        super(TimeXer, self).__init__(h=h,
                                      input_size=input_size,
                                      n_series=n_series,
                                      stat_exog_list = stat_exog_list,
                                      futr_exog_list = futr_exog_list,
                                      hist_exog_list = hist_exog_list,
                                      loss=loss,
                                      valid_loss=valid_loss,
                                      max_steps=max_steps,
                                      learning_rate=learning_rate,
                                      num_lr_decays=num_lr_decays,
                                      early_stop_patience_steps=early_stop_patience_steps,
                                      val_check_steps=val_check_steps,
                                      batch_size=batch_size,
                                      step_size=step_size,
                                      scaler_type=scaler_type,
                                      random_seed=random_seed,
                                      drop_last_loader=drop_last_loader,
                                      optimizer=optimizer,
                                      optimizer_kwargs=optimizer_kwargs,
                                      lr_scheduler=lr_scheduler,
                                      lr_scheduler_kwargs=lr_scheduler_kwargs,
                                      dataloader_kwargs=dataloader_kwargs,
                                      **trainer_kwargs)

        self.enc_in = n_series
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.e_layers = e_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.factor = factor
        self.patch_len = patch_len
        self.use_norm = use_norm
        # Number of whole patches; any remainder of input_size is dropped by
        # EnEmbedding's unfold.
        self.patch_num = int(input_size // self.patch_len)

        # Architecture
        self.en_embedding = EnEmbedding(n_series, self.hidden_size, self.patch_len, self.dropout)
        self.ex_embedding = DataEmbedding_inverted(input_size, self.hidden_size, self.dropout)

        # Both self- and cross-attention are unmasked (mask_flag=False).
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(
                        FullAttention(False, self.factor, attention_dropout=self.dropout,
                                      output_attention=False),
                        self.hidden_size, self.n_heads),
                    AttentionLayer(
                        FullAttention(False, self.factor, attention_dropout=self.dropout,
                                      output_attention=False),
                        self.hidden_size, self.n_heads),
                    self.hidden_size,
                    self.d_ff,
                    dropout=self.dropout,
                    activation='relu',
                )
                for l in range(self.e_layers)
            ],
            norm_layer=torch.nn.LayerNorm(self.hidden_size)
        )
        # +1 accounts for the global token appended by EnEmbedding.
        self.head_nf = self.hidden_size * (self.patch_num + 1)
        self.head = FlattenHead(self.enc_in, self.head_nf, h,
                                head_dropout=self.dropout)

    def forecast(self, x_enc, x_mark_enc):
        # x_enc: [batch, input_size, n_series]; x_mark_enc: flattened future
        # exogenous over the input window, or None.
        if self.use_norm:
            # Normalization from Non-stationary Transformer
            means = x_enc.mean(1, keepdim=True).detach()
            x_enc = x_enc - means
            stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
            x_enc /= stdev

        _, _, N = x_enc.shape

        # Endogenous tokens are per-variate patches; exogenous tokens are
        # whole-series (inverted) embeddings.
        en_embed, n_vars = self.en_embedding(x_enc.permute(0, 2, 1))
        ex_embed = self.ex_embedding(x_enc, x_mark_enc)

        enc_out = self.encoder(en_embed, ex_embed)
        enc_out = torch.reshape(
            enc_out, (-1, n_vars, enc_out.shape[-2], enc_out.shape[-1]))
        # z: [bs x nvars x d_model x patch_num]
        enc_out = enc_out.permute(0, 1, 3, 2)

        dec_out = self.head(enc_out)  # z: [bs x nvars x target_window]
        dec_out = dec_out.permute(0, 2, 1)

        if self.use_norm:
            # De-Normalization from Non-stationary Transformer
            dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.h, 1))
            dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.h, 1))

        return dec_out

    def forward(self, windows_batch):
        insample_y = windows_batch['insample_y']
        futr_exog = windows_batch['futr_exog']

        if self.futr_exog_size > 0:
            # Keep only the input window of the future exogenous and flatten
            # the (variate, feature) dims into channels: [B, T, V*D].
            x_mark_enc = futr_exog[:, :, :self.input_size, :]
            B, V, T, D = x_mark_enc.shape
            x_mark_enc = x_mark_enc.reshape(B, T, V*D)
        else:
            x_mark_enc = None

        y_pred = self.forecast(insample_y, x_mark_enc)
        y_pred = y_pred[:, -self.h:, :]
        y_pred = self.loss.domain_map(y_pred)

        # domain_map can collapse the last dimension; restore it for
        # univariate outputs so callers always see [B, h, n_series-like].
        if y_pred.ndim == 2:
            return y_pred.unsqueeze(-1)
        else:
            return y_pred
name='TimeXer.predict')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Usage example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from neuralforecast import NeuralForecast\n", + "from neuralforecast.models import TimeXer\n", + "from neuralforecast.losses.pytorch import MSE, MAE\n", + "from neuralforecast.utils import AirPassengersPanel, AirPassengersStatic, augment_calendar_df\n", + "\n", + "AirPassengersPanel, calendar_cols = augment_calendar_df(df=AirPassengersPanel, freq='M')\n", + "\n", + "Y_train_df = AirPassengersPanel[AirPassengersPanel.ds<AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) # 132 train\n", + "Y_test_df = AirPassengersPanel[AirPassengersPanel.ds>=AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) # 12 test\n", + "\n", + "model = TimeXer(h=12,\n", + " input_size=24,\n", + " n_series=2,\n", + " futr_exog_list=[\"trend\", \"month\"],\n", + " patch_len=12,\n", + " hidden_size=128,\n", + " n_heads=2,\n", + " e_layers=2,\n", + " d_ff=4,\n", + " factor=1,\n", + " dropout=0.1,\n", + " use_norm=True,\n", + " loss=MSE(),\n", + " valid_loss=MAE(),\n", + " early_stop_patience_steps=3,\n", + " batch_size=32)\n", + "\n", + "fcst = NeuralForecast(models=[model], freq='M')\n", + "fcst.fit(df=Y_train_df, static_df=AirPassengersStatic, val_size=12)\n", + "forecasts = fcst.predict(futr_df=Y_test_df)\n", + "\n", + "# Plot predictions\n", + "fig, ax = plt.subplots(1, 1, figsize = (20, 7))\n", + "Y_hat_df = forecasts.reset_index(drop=False).drop(columns=['unique_id','ds'])\n", + "plot_df = pd.concat([Y_test_df, Y_hat_df], axis=1)\n", + "plot_df = pd.concat([Y_train_df, plot_df])\n", + "\n", + "plot_df = plot_df[plot_df.unique_id=='Airline1'].drop('unique_id', axis=1)\n", + "plt.plot(plot_df['ds'], plot_df['y'], c='black', label='True')\n", + "plt.plot(plot_df['ds'], plot_df['TimeXer'], c='blue', label='Forecast')\n", + "ax.set_title('AirPassengers Forecast', fontsize=22)\n", + 
"ax.set_ylabel('Monthly Passengers', fontsize=20)\n", + "ax.set_xlabel('Year', fontsize=20)\n", + "ax.legend(prop={'size': 15})\n", + "ax.grid()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/models.vanillatransformer.ipynb b/nbs/models.vanillatransformer.ipynb index 232de7dfa..e4354af1a 100644 --- a/nbs/models.vanillatransformer.ipynb +++ b/nbs/models.vanillatransformer.ipynb @@ -55,7 +55,6 @@ "outputs": [], "source": [ "#| export\n", - "import math\n", "import numpy as np\n", "from typing import Optional\n", "\n", @@ -65,7 +64,7 @@ "from neuralforecast.common._modules import (\n", " TransEncoderLayer, TransEncoder,\n", " TransDecoderLayer, TransDecoder,\n", - " DataEmbedding, AttentionLayer,\n", + " DataEmbedding, AttentionLayer, FullAttention\n", ")\n", "from neuralforecast.common._base_windows import BaseWindows\n", "\n", @@ -87,64 +86,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. 
Auxiliary Functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "class TriangularCausalMask():\n", - " def __init__(self, B, L, device=\"cpu\"):\n", - " mask_shape = [B, 1, L, L]\n", - " with torch.no_grad():\n", - " self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)\n", - "\n", - " @property\n", - " def mask(self):\n", - " return self._mask\n", - "\n", - "class FullAttention(nn.Module):\n", - " \"\"\"\n", - " FullAttention\n", - " \"\"\" \n", - " def __init__(self, mask_flag=True, scale=None, attention_dropout=0.1, output_attention=False):\n", - " super(FullAttention, self).__init__()\n", - " self.scale = scale\n", - " self.mask_flag = mask_flag\n", - " self.output_attention = output_attention\n", - " self.dropout = nn.Dropout(attention_dropout)\n", - "\n", - " def forward(self, queries, keys, values, attn_mask):\n", - " B, L, H, E = queries.shape\n", - " _, S, _, D = values.shape\n", - " scale = self.scale or 1. / math.sqrt(E)\n", - "\n", - " scores = torch.einsum(\"blhe,bshe->bhls\", queries, keys)\n", - " \n", - " if self.mask_flag:\n", - " if attn_mask is None:\n", - " attn_mask = TriangularCausalMask(B, L, device=queries.device)\n", - "\n", - " scores.masked_fill_(attn_mask.mask, -np.inf)\n", - "\n", - " A = self.dropout(torch.softmax(scale * scores, dim=-1))\n", - " V = torch.einsum(\"bhls,bshd->blhd\", A, values)\n", - "\n", - " if self.output_attention:\n", - " return (V.contiguous(), A)\n", - " else:\n", - " return (V.contiguous(), None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. VanillaTransformer" + "## 1. 
VanillaTransformer" ] }, { diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml index f4d7e3b2d..31ed63346 100644 --- a/nbs/sidebar.yml +++ b/nbs/sidebar.yml @@ -50,6 +50,7 @@ website: - models.timemixer.ipynb - models.timellm.ipynb - models.timesnet.ipynb + - models.timexer.ipynb - models.tsmixer.ipynb - models.tsmixerx.ipynb - models.vanillatransformer.ipynb diff --git a/neuralforecast/_modidx.py b/neuralforecast/_modidx.py index 445266155..ab1fe227b 100644 --- a/neuralforecast/_modidx.py +++ b/neuralforecast/_modidx.py @@ -140,6 +140,11 @@ 'neuralforecast/auto.py'), 'neuralforecast.auto.AutoTimeMixer.get_default_config': ( 'models.html#autotimemixer.get_default_config', 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoTimeXer': ('models.html#autotimexer', 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoTimeXer.__init__': ( 'models.html#autotimexer.__init__', + 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoTimeXer.get_default_config': ( 'models.html#autotimexer.get_default_config', + 'neuralforecast/auto.py'), 'neuralforecast.auto.AutoTimesNet': ('models.html#autotimesnet', 'neuralforecast/auto.py'), 'neuralforecast.auto.AutoTimesNet.__init__': ( 'models.html#autotimesnet.__init__', 'neuralforecast/auto.py'), @@ -788,25 +793,7 @@ 'neuralforecast/models/informer.py'), 'neuralforecast.models.informer.ProbMask.mask': ( 'models.informer.html#probmask.mask', 'neuralforecast/models/informer.py')}, - 'neuralforecast.models.itransformer': { 'neuralforecast.models.itransformer.DataEmbedding_inverted': ( 'models.itransformer.html#dataembedding_inverted', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.DataEmbedding_inverted.__init__': ( 'models.itransformer.html#dataembedding_inverted.__init__', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.DataEmbedding_inverted.forward': ( 'models.itransformer.html#dataembedding_inverted.forward', - 'neuralforecast/models/itransformer.py'), - 
'neuralforecast.models.itransformer.FullAttention': ( 'models.itransformer.html#fullattention', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.FullAttention.__init__': ( 'models.itransformer.html#fullattention.__init__', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.FullAttention.forward': ( 'models.itransformer.html#fullattention.forward', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.TriangularCausalMask': ( 'models.itransformer.html#triangularcausalmask', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.TriangularCausalMask.__init__': ( 'models.itransformer.html#triangularcausalmask.__init__', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.TriangularCausalMask.mask': ( 'models.itransformer.html#triangularcausalmask.mask', - 'neuralforecast/models/itransformer.py'), - 'neuralforecast.models.itransformer.iTransformer': ( 'models.itransformer.html#itransformer', + 'neuralforecast.models.itransformer': { 'neuralforecast.models.itransformer.iTransformer': ( 'models.itransformer.html#itransformer', 'neuralforecast/models/itransformer.py'), 'neuralforecast.models.itransformer.iTransformer.__init__': ( 'models.itransformer.html#itransformer.__init__', 'neuralforecast/models/itransformer.py'), @@ -1292,6 +1279,38 @@ 'neuralforecast/models/timesnet.py'), 'neuralforecast.models.timesnet.TimesNet.forward': ( 'models.timesnet.html#timesnet.forward', 'neuralforecast/models/timesnet.py')}, + 'neuralforecast.models.timexer': { 'neuralforecast.models.timexer.EnEmbedding': ( 'models.timexer.html#enembedding', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.EnEmbedding.__init__': ( 'models.timexer.html#enembedding.__init__', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.EnEmbedding.forward': ( 'models.timexer.html#enembedding.forward', + 
'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.Encoder': ( 'models.timexer.html#encoder', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.Encoder.__init__': ( 'models.timexer.html#encoder.__init__', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.Encoder.forward': ( 'models.timexer.html#encoder.forward', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.EncoderLayer': ( 'models.timexer.html#encoderlayer', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.EncoderLayer.__init__': ( 'models.timexer.html#encoderlayer.__init__', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.EncoderLayer.forward': ( 'models.timexer.html#encoderlayer.forward', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.FlattenHead': ( 'models.timexer.html#flattenhead', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.FlattenHead.__init__': ( 'models.timexer.html#flattenhead.__init__', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.FlattenHead.forward': ( 'models.timexer.html#flattenhead.forward', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.TimeXer': ( 'models.timexer.html#timexer', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.TimeXer.__init__': ( 'models.timexer.html#timexer.__init__', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.TimeXer.forecast': ( 'models.timexer.html#timexer.forecast', + 'neuralforecast/models/timexer.py'), + 'neuralforecast.models.timexer.TimeXer.forward': ( 'models.timexer.html#timexer.forward', + 'neuralforecast/models/timexer.py')}, 'neuralforecast.models.tsmixer': { 'neuralforecast.models.tsmixer.FeatureMixing': ( 'models.tsmixer.html#featuremixing', 'neuralforecast/models/tsmixer.py'), 'neuralforecast.models.tsmixer.FeatureMixing.__init__': ( 
'models.tsmixer.html#featuremixing.__init__', @@ -1362,19 +1381,7 @@ 'neuralforecast/models/tsmixerx.py'), 'neuralforecast.models.tsmixerx.TemporalMixing.forward': ( 'models.tsmixerx.html#temporalmixing.forward', 'neuralforecast/models/tsmixerx.py')}, - 'neuralforecast.models.vanillatransformer': { 'neuralforecast.models.vanillatransformer.FullAttention': ( 'models.vanillatransformer.html#fullattention', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.FullAttention.__init__': ( 'models.vanillatransformer.html#fullattention.__init__', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.FullAttention.forward': ( 'models.vanillatransformer.html#fullattention.forward', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.TriangularCausalMask': ( 'models.vanillatransformer.html#triangularcausalmask', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.TriangularCausalMask.__init__': ( 'models.vanillatransformer.html#triangularcausalmask.__init__', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.TriangularCausalMask.mask': ( 'models.vanillatransformer.html#triangularcausalmask.mask', - 'neuralforecast/models/vanillatransformer.py'), - 'neuralforecast.models.vanillatransformer.VanillaTransformer': ( 'models.vanillatransformer.html#vanillatransformer', + 'neuralforecast.models.vanillatransformer': { 'neuralforecast.models.vanillatransformer.VanillaTransformer': ( 'models.vanillatransformer.html#vanillatransformer', 'neuralforecast/models/vanillatransformer.py'), 'neuralforecast.models.vanillatransformer.VanillaTransformer.__init__': ( 'models.vanillatransformer.html#vanillatransformer.__init__', 'neuralforecast/models/vanillatransformer.py'), diff --git a/neuralforecast/auto.py b/neuralforecast/auto.py index b3c85892a..1d407bd58 100644 --- 
a/neuralforecast/auto.py +++ b/neuralforecast/auto.py @@ -4,7 +4,7 @@ __all__ = ['AutoRNN', 'AutoLSTM', 'AutoGRU', 'AutoTCN', 'AutoDeepAR', 'AutoDilatedRNN', 'AutoBiTCN', 'AutoMLP', 'AutoNBEATS', 'AutoNBEATSx', 'AutoNHITS', 'AutoDLinear', 'AutoNLinear', 'AutoTiDE', 'AutoDeepNPTS', 'AutoKAN', 'AutoTFT', 'AutoVanillaTransformer', 'AutoInformer', 'AutoAutoformer', 'AutoFEDformer', 'AutoPatchTST', - 'AutoiTransformer', 'AutoTimesNet', 'AutoStemGNN', 'AutoHINT', 'AutoTSMixer', 'AutoTSMixerx', + 'AutoiTransformer', 'AutoTimeXer', 'AutoTimesNet', 'AutoStemGNN', 'AutoHINT', 'AutoTSMixer', 'AutoTSMixerx', 'AutoMLPMultivariate', 'AutoSOFTS', 'AutoTimeMixer', 'AutoRMoK'] # %% ../nbs/models.ipynb 2 @@ -42,6 +42,7 @@ from .models.patchtst import PatchTST from .models.timesnet import TimesNet from .models.itransformer import iTransformer +from .models.timexer import TimeXer from .models.kan import KAN from .models.rmok import RMoK @@ -1672,7 +1673,92 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 109 +# %% ../nbs/models.ipynb 108 +class AutoTimeXer(BaseAuto): + + default_config = { + "input_size_multiplier": [1, 2, 3, 4, 5], + "h": None, + "n_series": None, + "hidden_size": tune.choice([128, 256, 512]), + "n_heads": tune.choice([4, 8]), + "learning_rate": tune.loguniform(1e-4, 1e-1), + "scaler_type": tune.choice([None, "robust", "standard"]), + "max_steps": tune.choice([500, 1000, 2000]), + "batch_size": tune.choice([32, 64, 128, 256]), + "loss": None, + "random_seed": tune.randint(1, 20), + } + + def __init__( + self, + h, + n_series, + loss=MAE(), + valid_loss=None, + config=None, + search_alg=BasicVariantGenerator(random_state=1), + num_samples=10, + refit_with_val=False, + cpus=cpu_count(), + gpus=torch.cuda.device_count(), + verbose=False, + alias=None, + backend="ray", + callbacks=None, + ): + + # Define search space, input/output sizes + if config is None: + config = self.get_default_config(h=h, backend=backend, 
n_series=n_series) + + # Always use n_series from parameters, raise exception with Optuna because we can't enforce it + if backend == "ray": + config["n_series"] = n_series + elif backend == "optuna": + mock_trial = MockTrial() + if ( + "n_series" in config(mock_trial) + and config(mock_trial)["n_series"] != n_series + ) or ("n_series" not in config(mock_trial)): + raise Exception(f"config needs 'n_series': {n_series}") + + super(AutoTimeXer, self).__init__( + cls_model=TimeXer, + h=h, + loss=loss, + valid_loss=valid_loss, + config=config, + search_alg=search_alg, + num_samples=num_samples, + refit_with_val=refit_with_val, + cpus=cpus, + gpus=gpus, + verbose=verbose, + alias=alias, + backend=backend, + callbacks=callbacks, + ) + + @classmethod + def get_default_config(cls, h, backend, n_series): + config = cls.default_config.copy() + config["input_size"] = tune.choice( + [h * x for x in config["input_size_multiplier"]] + ) + + # Rolling windows with step_size=1 or step_size=h + # See `BaseWindows` and `BaseRNN`'s create_windows + config["step_size"] = tune.choice([1, h]) + del config["input_size_multiplier"] + if backend == "optuna": + # Always use n_series from parameters + config["n_series"] = n_series + config = cls._ray_config_to_optuna(config) + + return config + +# %% ../nbs/models.ipynb 113 class AutoTimesNet(BaseAuto): default_config = { @@ -1740,7 +1826,7 @@ def get_default_config(cls, h, backend, n_series=None): return config -# %% ../nbs/models.ipynb 114 +# %% ../nbs/models.ipynb 118 class AutoStemGNN(BaseAuto): default_config = { @@ -1825,7 +1911,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 118 +# %% ../nbs/models.ipynb 122 class AutoHINT(BaseAuto): def __init__( @@ -1897,7 +1983,7 @@ def _fit_model( def get_default_config(cls, h, backend, n_series=None): raise Exception("AutoHINT has no default configuration.") -# %% ../nbs/models.ipynb 123 +# %% ../nbs/models.ipynb 127 class AutoTSMixer(BaseAuto): 
default_config = { @@ -1983,7 +2069,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 127 +# %% ../nbs/models.ipynb 131 class AutoTSMixerx(BaseAuto): default_config = { @@ -2069,7 +2155,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 131 +# %% ../nbs/models.ipynb 135 class AutoMLPMultivariate(BaseAuto): default_config = { @@ -2154,7 +2240,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 135 +# %% ../nbs/models.ipynb 139 class AutoSOFTS(BaseAuto): default_config = { @@ -2239,7 +2325,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 139 +# %% ../nbs/models.ipynb 143 class AutoTimeMixer(BaseAuto): default_config = { @@ -2325,7 +2411,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 143 +# %% ../nbs/models.ipynb 147 class AutoRMoK(BaseAuto): default_config = { diff --git a/neuralforecast/common/_modules.py b/neuralforecast/common/_modules.py index d50228b87..9c73ff85c 100644 --- a/neuralforecast/common/_modules.py +++ b/neuralforecast/common/_modules.py @@ -2,12 +2,13 @@ # %% auto 0 __all__ = ['ACTIVATIONS', 'MLP', 'Chomp1d', 'CausalConv1d', 'TemporalConvolutionEncoder', 'TransEncoderLayer', 'TransEncoder', - 'TransDecoderLayer', 'TransDecoder', 'AttentionLayer', 'PositionalEmbedding', 'TokenEmbedding', - 'TimeFeatureEmbedding', 'FixedEmbedding', 'TemporalEmbedding', 'DataEmbedding', 'MovingAvg', 'SeriesDecomp', - 'RevIN'] + 'TransDecoderLayer', 'TransDecoder', 'AttentionLayer', 'TriangularCausalMask', 'FullAttention', + 'PositionalEmbedding', 'TokenEmbedding', 'TimeFeatureEmbedding', 'FixedEmbedding', 'TemporalEmbedding', + 'DataEmbedding', 'DataEmbedding_inverted', 'MovingAvg', 'SeriesDecomp', 'RevIN'] # %% ../../nbs/common.modules.ipynb 3 import math +import numpy as np import torch import torch.nn as nn @@ -317,34 +318,95 @@ def 
forward(self, x, cross, x_mask=None, cross_mask=None): # %% ../../nbs/common.modules.ipynb 17 class AttentionLayer(nn.Module): - def __init__(self, attention, hidden_size, n_head, d_keys=None, d_values=None): + def __init__(self, attention, hidden_size, n_heads, d_keys=None, d_values=None): super(AttentionLayer, self).__init__() - d_keys = d_keys or (hidden_size // n_head) - d_values = d_values or (hidden_size // n_head) + d_keys = d_keys or (hidden_size // n_heads) + d_values = d_values or (hidden_size // n_heads) self.inner_attention = attention - self.query_projection = nn.Linear(hidden_size, d_keys * n_head) - self.key_projection = nn.Linear(hidden_size, d_keys * n_head) - self.value_projection = nn.Linear(hidden_size, d_values * n_head) - self.out_projection = nn.Linear(d_values * n_head, hidden_size) - self.n_head = n_head + self.query_projection = nn.Linear(hidden_size, d_keys * n_heads) + self.key_projection = nn.Linear(hidden_size, d_keys * n_heads) + self.value_projection = nn.Linear(hidden_size, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, hidden_size) + self.n_heads = n_heads - def forward(self, queries, keys, values, attn_mask): + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): B, L, _ = queries.shape _, S, _ = keys.shape - H = self.n_head + H = self.n_heads queries = self.query_projection(queries).view(B, L, H, -1) keys = self.key_projection(keys).view(B, S, H, -1) values = self.value_projection(values).view(B, S, H, -1) - out, attn = self.inner_attention(queries, keys, values, attn_mask) + out, attn = self.inner_attention( + queries=queries, + keys=keys, + values=values, + attn_mask=attn_mask, + tau=tau, + delta=delta, + ) out = out.view(B, L, -1) return self.out_projection(out), attn # %% ../../nbs/common.modules.ipynb 18 +class TriangularCausalMask: + """ + TriangularCausalMask + """ + + def __init__(self, B, L, device="cpu"): + mask_shape = [B, 1, L, L] + with torch.no_grad(): + self._mask = 
torch.triu( + torch.ones(mask_shape, dtype=torch.bool), diagonal=1 + ).to(device) + + @property + def mask(self): + return self._mask + + +class FullAttention(nn.Module): + def __init__( + self, + mask_flag=True, + factor=5, + scale=None, + attention_dropout=0.1, + output_attention=False, + ): + super(FullAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1.0 / math.sqrt(E) + + scores = torch.einsum("blhe,bshe->bhls", queries, keys) + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return V.contiguous(), A + else: + return V.contiguous(), None + +# %% ../../nbs/common.modules.ipynb 19 class PositionalEmbedding(nn.Module): def __init__(self, hidden_size, max_len=5000): super(PositionalEmbedding, self).__init__() @@ -487,7 +549,29 @@ def forward(self, x, x_mark=None): return self.dropout(x) -# %% ../../nbs/common.modules.ipynb 19 + +class DataEmbedding_inverted(nn.Module): + """ + DataEmbedding_inverted + """ + + def __init__(self, c_in, hidden_size, dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, hidden_size) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is None: + x = self.value_embedding(x) + else: + # the potential to take covariates (e.g. 
timestamps) as tokens + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate hidden_size] + return self.dropout(x) + +# %% ../../nbs/common.modules.ipynb 20 class MovingAvg(nn.Module): """ Moving average block to highlight the trend of time series @@ -522,7 +606,7 @@ def forward(self, x): res = x - moving_mean return res, moving_mean -# %% ../../nbs/common.modules.ipynb 20 +# %% ../../nbs/common.modules.ipynb 21 class RevIN(nn.Module): """RevIN (Reversible-Instance-Normalization)""" diff --git a/neuralforecast/core.py b/neuralforecast/core.py index f8b254745..74050c6c4 100644 --- a/neuralforecast/core.py +++ b/neuralforecast/core.py @@ -66,6 +66,7 @@ TimeMixer, KAN, RMoK, + TimeXer, ) from .common._base_auto import BaseAuto, MockTrial from .utils import PredictionIntervals, get_prediction_interval_method @@ -193,6 +194,8 @@ def _insample_times( "autokan": KAN, "rmok": RMoK, "autormok": RMoK, + "timexer": TimeXer, + "autotimexer": TimeXer, } # %% ../nbs/core.ipynb 8 diff --git a/neuralforecast/models/__init__.py b/neuralforecast/models/__init__.py index 414689631..6a12e7b4a 100644 --- a/neuralforecast/models/__init__.py +++ b/neuralforecast/models/__init__.py @@ -3,6 +3,7 @@ 'TFT', 'VanillaTransformer', 'Informer', 'Autoformer', 'PatchTST', 'FEDformer', 'StemGNN', 'HINT', 'TimesNet', 'TimeLLM', 'TSMixer', 'TSMixerx', 'MLPMultivariate', 'iTransformer', 'BiTCN', 'TiDE', 'DeepNPTS', 'SOFTS', 'TimeMixer', 'KAN', 'RMoK', + 'TimeXer', ] from .rnn import RNN @@ -38,3 +39,4 @@ from .timemixer import TimeMixer from .kan import KAN from .rmok import RMoK +from .timexer import TimeXer diff --git a/neuralforecast/models/informer.py b/neuralforecast/models/informer.py index cb4ff2622..7b2465987 100644 --- a/neuralforecast/models/informer.py +++ b/neuralforecast/models/informer.py @@ -149,7 +149,7 @@ def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): else: return (context_in, None) - def forward(self, queries, keys, 
values, attn_mask): + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): B, L_Q, H, D = queries.shape _, L_K, _, _ = keys.shape diff --git a/neuralforecast/models/itransformer.py b/neuralforecast/models/itransformer.py index 121eac2b5..45315d8ab 100644 --- a/neuralforecast/models/itransformer.py +++ b/neuralforecast/models/itransformer.py @@ -1,17 +1,13 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.itransformer.ipynb. # %% auto 0 -__all__ = ['TriangularCausalMask', 'FullAttention', 'DataEmbedding_inverted', 'iTransformer'] +__all__ = ['iTransformer'] # %% ../../nbs/models.itransformer.ipynb 6 import torch import torch.nn as nn import torch.nn.functional as F -import numpy as np - -from math import sqrt - from ..losses.pytorch import MAE from ..common._base_multivariate import BaseMultivariate @@ -19,89 +15,11 @@ TransEncoder, TransEncoderLayer, AttentionLayer, + FullAttention, + DataEmbedding_inverted, ) -# %% ../../nbs/models.itransformer.ipynb 9 -class TriangularCausalMask: - """ - TriangularCausalMask - """ - - def __init__(self, B, L, device="cpu"): - mask_shape = [B, 1, L, L] - with torch.no_grad(): - self._mask = torch.triu( - torch.ones(mask_shape, dtype=torch.bool), diagonal=1 - ).to(device) - - @property - def mask(self): - return self._mask - - -class FullAttention(nn.Module): - """ - FullAttention - """ - - def __init__( - self, - mask_flag=True, - factor=5, - scale=None, - attention_dropout=0.1, - output_attention=False, - ): - super(FullAttention, self).__init__() - self.scale = scale - self.mask_flag = mask_flag - self.output_attention = output_attention - self.dropout = nn.Dropout(attention_dropout) - - def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): - B, L, H, E = queries.shape - _, S, _, D = values.shape - scale = self.scale or 1.0 / sqrt(E) - - scores = torch.einsum("blhe,bshe->bhls", queries, keys) - - if self.mask_flag: - if attn_mask is None: - attn_mask = 
TriangularCausalMask(B, L, device=queries.device) - - scores.masked_fill_(attn_mask.mask, -np.inf) - - A = self.dropout(torch.softmax(scale * scores, dim=-1)) - V = torch.einsum("bhls,bshd->blhd", A, values) - - if self.output_attention: - return (V.contiguous(), A) - else: - return (V.contiguous(), None) - -# %% ../../nbs/models.itransformer.ipynb 11 -class DataEmbedding_inverted(nn.Module): - """ - DataEmbedding_inverted - """ - - def __init__(self, c_in, hidden_size, dropout=0.1): - super(DataEmbedding_inverted, self).__init__() - self.value_embedding = nn.Linear(c_in, hidden_size) - self.dropout = nn.Dropout(p=dropout) - - def forward(self, x, x_mark): - x = x.permute(0, 2, 1) - # x: [Batch Variate Time] - if x_mark is None: - x = self.value_embedding(x) - else: - # the potential to take covariates (e.g. timestamps) as tokens - x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) - # x: [Batch Variate hidden_size] - return self.dropout(x) - -# %% ../../nbs/models.itransformer.ipynb 13 +# %% ../../nbs/models.itransformer.ipynb 8 class iTransformer(BaseMultivariate): """iTransformer diff --git a/neuralforecast/models/timexer.py b/neuralforecast/models/timexer.py new file mode 100644 index 000000000..14adf1d15 --- /dev/null +++ b/neuralforecast/models/timexer.py @@ -0,0 +1,352 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.timexer.ipynb. 
+ +# %% auto 0 +__all__ = ['FlattenHead', 'Encoder', 'EncoderLayer', 'EnEmbedding', 'TimeXer'] + +# %% ../../nbs/models.timexer.ipynb 5 +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..losses.pytorch import MAE +from ..common._base_multivariate import BaseMultivariate +from neuralforecast.common._modules import ( + DataEmbedding_inverted, + PositionalEmbedding, + FullAttention, + AttentionLayer, +) + +# %% ../../nbs/models.timexer.ipynb 7 +class FlattenHead(nn.Module): + def __init__(self, n_vars, nf, target_window, head_dropout=0): + super().__init__() + self.n_vars = n_vars + self.flatten = nn.Flatten(start_dim=-2) + self.linear = nn.Linear(nf, target_window) + self.dropout = nn.Dropout(head_dropout) + + def forward(self, x): # x: [bs x nvars x d_model x patch_num] + x = self.flatten(x) + x = self.linear(x) + x = self.dropout(x) + return x + +# %% ../../nbs/models.timexer.ipynb 8 +class Encoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Encoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer( + x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta + ) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x + +# %% ../../nbs/models.timexer.ipynb 9 +class EncoderLayer(nn.Module): + def __init__( + self, + self_attention, + cross_attention, + d_model, + d_ff=None, + dropout=0.1, + activation="relu", + ): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = 
nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + B, L, D = cross.shape + x = x + self.dropout( + self.self_attention(x, x, x, attn_mask=x_mask, tau=tau, delta=None)[0] + ) + x = self.norm1(x) + + x_glb_ori = x[:, -1, :].unsqueeze(1) + x_glb = torch.reshape(x_glb_ori, (B, -1, D)) + x_glb_attn = self.dropout( + self.cross_attention( + x_glb, cross, cross, attn_mask=cross_mask, tau=tau, delta=delta + )[0] + ) + x_glb_attn = torch.reshape( + x_glb_attn, (x_glb_attn.shape[0] * x_glb_attn.shape[1], x_glb_attn.shape[2]) + ).unsqueeze(1) + x_glb = x_glb_ori + x_glb_attn + x_glb = self.norm2(x_glb) + + y = x = torch.cat([x[:, :-1, :], x_glb], dim=1) + + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + +# %% ../../nbs/models.timexer.ipynb 10 +class EnEmbedding(nn.Module): + def __init__(self, n_vars, d_model, patch_len, dropout): + super(EnEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + self.glb_token = nn.Parameter(torch.randn(1, n_vars, 1, d_model)) + self.position_embedding = PositionalEmbedding(d_model) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + glb = self.glb_token.repeat((x.shape[0], 1, 1, 1)) + + x = x.unfold(dimension=-1, size=self.patch_len, step=self.patch_len) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + x = torch.reshape(x, (-1, n_vars, x.shape[-2], x.shape[-1])) + x = torch.cat([x, glb], dim=2) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + return 
self.dropout(x), n_vars + +# %% ../../nbs/models.timexer.ipynb 12 +class TimeXer(BaseMultivariate): + """ + TimeXer + + **Parameters:**
+ `h`: int, Forecast horizon.
+ `input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].
+ `n_series`: int, number of time-series.
+ `futr_exog_list`: str list, future exogenous columns.
+ `hist_exog_list`: str list, historic exogenous columns.
+ `stat_exog_list`: str list, static exogenous columns.
+ `patch_len`: int, length of patches.
+ `hidden_size`: int, dimension of the model.
+ `n_heads`: int, number of heads.
+ `e_layers`: int, number of encoder layers.
+ `d_ff`: int, dimension of fully-connected layer.
+ `factor`: int, attention factor.
+ `dropout`: float, dropout rate.
+ `use_norm`: bool, whether to normalize or not.
+ `loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
+ `valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
+ `max_steps`: int=1000, maximum number of training steps.
+ `learning_rate`: float=1e-3, Learning rate between (0, 1).
+ `num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.
+ `early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.
+ `val_check_steps`: int=100, Number of training steps between every validation loss check.
+ `batch_size`: int=32, number of different series in each batch.
+ `step_size`: int=1, step size between each window of temporal data.
+ `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).
+ `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.
+ `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.
+ `alias`: str, optional, Custom name of the model.
+ `optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).
+ `optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.
+ `lr_scheduler`: Subclass of 'torch.optim.lr_scheduler.LRScheduler', optional, user specified lr_scheduler instead of the default choice (StepLR).
+ `lr_scheduler_kwargs`: dict, optional, list of parameters used by the user specified `lr_scheduler`.
+ `dataloader_kwargs`: dict, optional, list of parameters passed into the PyTorch Lightning dataloader by the `TimeSeriesDataLoader`.
+ `**trainer_kwargs`: int, keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).
+ +
+ + **References** + - [Yuxuan Wang, Haixu Wu, Jiaxiang Dong, Guo Qin, Haoran Zhang, Yong Liu, Yunzhong Qiu, Jianmin Wang, Mingsheng Long. "TimeXer: Empowering Transformers for Time Series Forecasting with Exogenous Variables"](https://arxiv.org/abs/2402.19072) + """ + + # Class attributes + SAMPLING_TYPE = "multivariate" + EXOGENOUS_FUTR = True + EXOGENOUS_HIST = False + EXOGENOUS_STAT = False + + def __init__( + self, + h, + input_size, + n_series, + futr_exog_list=None, + hist_exog_list=None, + stat_exog_list=None, + patch_len: int = 16, + hidden_size: int = 512, + n_heads: int = 8, + e_layers: int = 2, + d_ff: int = 2048, + factor: int = 1, + dropout: float = 0.1, + use_norm: bool = True, + loss=MAE(), + valid_loss=None, + max_steps: int = 1000, + learning_rate: float = 1e-3, + num_lr_decays: int = -1, + early_stop_patience_steps: int = -1, + val_check_steps: int = 100, + batch_size: int = 32, + step_size: int = 1, + scaler_type: str = "identity", + random_seed: int = 1, + drop_last_loader: bool = False, + optimizer=None, + optimizer_kwargs=None, + lr_scheduler=None, + lr_scheduler_kwargs=None, + dataloader_kwargs=None, + **trainer_kwargs + ): + + super(TimeXer, self).__init__( + h=h, + input_size=input_size, + n_series=n_series, + stat_exog_list=stat_exog_list, + futr_exog_list=futr_exog_list, + hist_exog_list=hist_exog_list, + loss=loss, + valid_loss=valid_loss, + max_steps=max_steps, + learning_rate=learning_rate, + num_lr_decays=num_lr_decays, + early_stop_patience_steps=early_stop_patience_steps, + val_check_steps=val_check_steps, + batch_size=batch_size, + step_size=step_size, + scaler_type=scaler_type, + random_seed=random_seed, + drop_last_loader=drop_last_loader, + optimizer=optimizer, + optimizer_kwargs=optimizer_kwargs, + lr_scheduler=lr_scheduler, + lr_scheduler_kwargs=lr_scheduler_kwargs, + dataloader_kwargs=dataloader_kwargs, + **trainer_kwargs + ) + + self.enc_in = n_series + self.hidden_size = hidden_size + self.n_heads = n_heads + self.e_layers 
= e_layers + self.d_ff = d_ff + self.dropout = dropout + self.factor = factor + self.patch_len = patch_len + self.use_norm = use_norm + self.patch_num = int(input_size // self.patch_len) + + # Architecture + self.en_embedding = EnEmbedding( + n_series, self.hidden_size, self.patch_len, self.dropout + ) + self.ex_embedding = DataEmbedding_inverted( + input_size, self.hidden_size, self.dropout + ) + + self.encoder = Encoder( + [ + EncoderLayer( + AttentionLayer( + FullAttention( + False, + self.factor, + attention_dropout=self.dropout, + output_attention=False, + ), + self.hidden_size, + self.n_heads, + ), + AttentionLayer( + FullAttention( + False, + self.factor, + attention_dropout=self.dropout, + output_attention=False, + ), + self.hidden_size, + self.n_heads, + ), + self.hidden_size, + self.d_ff, + dropout=self.dropout, + activation="relu", + ) + for l in range(self.e_layers) + ], + norm_layer=torch.nn.LayerNorm(self.hidden_size), + ) + self.head_nf = self.hidden_size * (self.patch_num + 1) + self.head = FlattenHead(self.enc_in, self.head_nf, h, head_dropout=self.dropout) + + def forecast(self, x_enc, x_mark_enc): + if self.use_norm: + # Normalization from Non-stationary Transformer + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt( + torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5 + ) + x_enc /= stdev + + _, _, N = x_enc.shape + + en_embed, n_vars = self.en_embedding(x_enc.permute(0, 2, 1)) + ex_embed = self.ex_embedding(x_enc, x_mark_enc) + + enc_out = self.encoder(en_embed, ex_embed) + enc_out = torch.reshape( + enc_out, (-1, n_vars, enc_out.shape[-2], enc_out.shape[-1]) + ) + # z: [bs x nvars x d_model x patch_num] + enc_out = enc_out.permute(0, 1, 3, 2) + + dec_out = self.head(enc_out) # z: [bs x nvars x target_window] + dec_out = dec_out.permute(0, 2, 1) + + if self.use_norm: + # De-Normalization from Non-stationary Transformer + dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.h, 1)) + 
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.h, 1)) + + return dec_out + + def forward(self, windows_batch): + insample_y = windows_batch["insample_y"] + futr_exog = windows_batch["futr_exog"] + + if self.futr_exog_size > 0: + x_mark_enc = futr_exog[:, :, : self.input_size, :] + B, V, T, D = x_mark_enc.shape + x_mark_enc = x_mark_enc.reshape(B, T, V * D) + else: + x_mark_enc = None + + y_pred = self.forecast(insample_y, x_mark_enc) + y_pred = y_pred[:, -self.h :, :] + y_pred = self.loss.domain_map(y_pred) + + if y_pred.ndim == 2: + return y_pred.unsqueeze(-1) + else: + return y_pred diff --git a/neuralforecast/models/vanillatransformer.py b/neuralforecast/models/vanillatransformer.py index e38c03fc9..a8276d577 100644 --- a/neuralforecast/models/vanillatransformer.py +++ b/neuralforecast/models/vanillatransformer.py @@ -1,10 +1,9 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.vanillatransformer.ipynb. # %% auto 0 -__all__ = ['TriangularCausalMask', 'FullAttention', 'VanillaTransformer'] +__all__ = ['VanillaTransformer'] # %% ../../nbs/models.vanillatransformer.ipynb 5 -import math import numpy as np from typing import Optional @@ -18,61 +17,13 @@ TransDecoder, DataEmbedding, AttentionLayer, + FullAttention, ) from ..common._base_windows import BaseWindows from ..losses.pytorch import MAE # %% ../../nbs/models.vanillatransformer.ipynb 8 -class TriangularCausalMask: - def __init__(self, B, L, device="cpu"): - mask_shape = [B, 1, L, L] - with torch.no_grad(): - self._mask = torch.triu( - torch.ones(mask_shape, dtype=torch.bool), diagonal=1 - ).to(device) - - @property - def mask(self): - return self._mask - - -class FullAttention(nn.Module): - """ - FullAttention - """ - - def __init__( - self, mask_flag=True, scale=None, attention_dropout=0.1, output_attention=False - ): - super(FullAttention, self).__init__() - self.scale = scale - self.mask_flag = mask_flag - self.output_attention = output_attention - self.dropout = 
nn.Dropout(attention_dropout) - - def forward(self, queries, keys, values, attn_mask): - B, L, H, E = queries.shape - _, S, _, D = values.shape - scale = self.scale or 1.0 / math.sqrt(E) - - scores = torch.einsum("blhe,bshe->bhls", queries, keys) - - if self.mask_flag: - if attn_mask is None: - attn_mask = TriangularCausalMask(B, L, device=queries.device) - - scores.masked_fill_(attn_mask.mask, -np.inf) - - A = self.dropout(torch.softmax(scale * scores, dim=-1)) - V = torch.einsum("bhls,bshd->blhd", A, values) - - if self.output_attention: - return (V.contiguous(), A) - else: - return (V.contiguous(), None) - -# %% ../../nbs/models.vanillatransformer.ipynb 10 class VanillaTransformer(BaseWindows): """VanillaTransformer