
Commit 415b184

japols, dietervdb-meteo, ssmmnn11, pre-commit-ci[bot], and anaprietonem
authored and committed
feat(models): Triton GraphTransformer (#631)
## Description

Introduce a custom Triton kernel for a fast, memory-efficient and deterministic GraphTransformer.

Benchmarking results comparing the current GraphTransformer implementation (PyG) against compiled PyG and the Triton kernel on different graphs, measured in ms per fwd + bwd. All runs use era = n320, h = lvl6-multi-scale, dtype = fp16.

A100

| Graph type | PyG | PyG (compiled) | Triton |
|:-----------|-------:|---------------:|---------:|
| era_to_h | 53.09 | 15.86 | **6.81** |
| h_to_h | 20.32 | 4.45 | **1.82** |
| h_to_era | 100.44 | 20.53 | **11.19** |
| dop_enc | 60.44 | 17.90 | **9.08** |

H200

| Graph type | PyG | PyG (compiled) | Triton |
|:-----------|------:|---------------:|---------:|
| era_to_h | 23.97 | 6.02 | **2.77** |
| h_to_h | 9.64 | 1.69 | **0.83** |
| h_to_era | 46.99 | 8.16 | **4.87** |
| dop_enc | 26.62 | 6.85 | **3.86** |

Example impact on training for an o800 lvl7 GraphTransformer model in bf16 on H200s: PyG compiled: 3.90 s/it → Triton: 2.62 s/it (×1.5 speedup).

## What problem does this change solve?

The current PyG GraphTransformer suffers from a high memory footprint due to per-edge computations, multiple kernel launches, and stability issues with atomics in low precision. [Compiling](#181) can help address some of these issues, but a custom Triton kernel does better.

## Additional notes

Checked that outputs and gradients match the PyG version, plus a small training sanity check: [mlflow](https://mlflow.ecmwf.int/#/metric?runs=[%224803881259ff447cbbb4ff4436926823%22,%22b37aee50b3664afa9fc4aee6b557d71b%22]&metric=%22train_mse_loss_step%22&experiments=[%2245%22]&plot_metric_keys=%5B%22train_mse_loss_step%22%5D&plot_layout={%22autosize%22:true,%22xaxis%22:{%22autorange%22:true,%22type%22:%22linear%22,%22range%22:[-1035.9219986020375,960.4671241229228]},%22yaxis%22:{%22range%22:[-0.6010538821019717,6.501633332403755],%22autorange%22:true,%22type%22:%22linear%22}}&x_axis=step&y_axis_scale=linear&line_smoothness=1&show_point=false&deselected_curves=[]&last_linear_y_axis_range=[])

TODO

- [x] make GT backend configurable, pyg as fallback (see the configuration sketch below)
- [x] add pytests

***As a contributor to the Anemoi framework, please ensure that your changes include unit tests, updates to any affected dependencies and documentation, and have been tested in a parallel setting (i.e., with multiple GPUs). As a reviewer, you are also responsible for verifying these aspects and requesting changes if they are not adequately addressed. For guidelines about those please refer to https://anemoi.readthedocs.io/en/latest/***

By opening this pull request, I affirm that all authors agree to the [Contributor License Agreement.](https://github.com/ecmwf/codex/blob/main/Legal/contributor_license_agreement.md)

---------

Co-authored-by: Dieter Van den Bleeken <[email protected]>
Co-authored-by: Simon Lang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ana Prieto Nemesio <[email protected]>
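The new `graph_attention_backend` option is threaded through the GraphTransformer blocks, mappers and processor in the files below. As a hedged illustration only (not part of this commit): the `_target_` path and config layout here are assumptions mirroring `config/models/<model>.yaml`; the schema changes in this commit merely guarantee that `graph_attention_backend` is accepted as an extra string field.

```python
# Illustrative sketch: selecting the graph attention backend where the
# processor is configured. Keys other than "graph_attention_backend" are
# assumptions; the target path is assumed from the processor module location.
processor_config = {
    "_target_": "anemoi.models.layers.processor.GraphTransformerProcessor",  # assumed path
    "graph_attention_backend": "triton",  # or "pyg" to fall back to the previous implementation
}
```

The block constructor asserts that the value is either `"triton"` or `"pyg"`, so an invalid setting fails at model-construction time.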
1 parent c9bf182 commit 415b184

22 files changed: +695 -25 lines changed

.github/workflows/integration-tests-hpc.yml

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ jobs:
         pip install --upgrade pip
         pip install -e ./training[all,tests] -e ./models[all,tests] -e ./graphs[all,tests]
         python3 -m pytest -v training/tests/integration --slow
+        python3 -m pytest -v models/tests/integration --slow
         deactivate
         rm -rf $REPO_NAME
       sbatch_options: |

models/pytest.ini

Lines changed: 1 addition & 0 deletions
@@ -3,5 +3,6 @@ markers =
     data_dependent: marks tests depending on data (deselect with '-m "not data_dependent"')
     auth: marks tests that require authentication (deselect with '-m "not auth"')
     gpu: marks tests that require a GPU (deselect with '-m "not gpu"')
+    slow: mark test as slow (skipped unless --slow is used)

 tmp_path_retention_policy = none

models/src/anemoi/models/layers/block.py

Lines changed: 50 additions & 13 deletions
@@ -34,6 +34,8 @@
 from anemoi.models.layers.conv import GraphConv
 from anemoi.models.layers.conv import GraphTransformerConv
 from anemoi.models.layers.mlp import MLP
+from anemoi.models.triton.gt import GraphTransformerFunction
+from anemoi.models.triton.utils import edge_index_to_csc
 from anemoi.utils.config import DotDict

 LOGGER = logging.getLogger(__name__)
@@ -443,6 +445,7 @@ def __init__(
         qk_norm: bool = False,
         update_src_nodes: bool = False,
         layer_kernels: DotDict,
+        graph_attention_backend: str = "triton",
         **kwargs,
     ) -> None:
         """Initialize GraphTransformerBlock.
@@ -466,6 +469,8 @@ def __init__(
         layer_kernels : DotDict
             A dict of layer implementations e.g. layer_kernels.Linear = "torch.nn.Linear"
            Defined in config/models/<model>.yaml
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """
         super().__init__(**kwargs)

@@ -483,8 +488,6 @@ def __init__(
         self.lin_self = Linear(in_channels, num_heads * self.out_channels_conv, bias=bias)
         self.lin_edge = Linear(edge_dim, num_heads * self.out_channels_conv)  # , bias=False)

-        self.conv = GraphTransformerConv(out_channels=self.out_channels_conv)
-
         self.projection = Linear(out_channels, out_channels)

         if self.qk_norm:
@@ -499,6 +502,19 @@ def __init__(
             Linear(hidden_dim, out_channels),
         )

+        self.graph_attention_backend = graph_attention_backend
+        assert self.graph_attention_backend in [
+            "triton",
+            "pyg",
+        ], f"Backend {self.graph_attention_backend} not supported for GraphTransformerBlock, valid options are 'triton' and 'pyg'"
+
+        if self.graph_attention_backend == "triton":
+            LOGGER.info(f"{self.__class__.__name__} using triton graph attention backend.")
+            self.conv = GraphTransformerFunction.apply
+        else:
+            LOGGER.warning(f"{self.__class__.__name__} using pyg graph attention backend, consider using 'triton'.")
+            self.conv = GraphTransformerConv(out_channels=self.out_channels_conv)
+
     def run_node_dst_mlp(self, x, **layer_kwargs):
         return self.node_dst_mlp(self.layer_norm_mlp_dst(x, **layer_kwargs))

@@ -555,37 +571,50 @@ def shard_qkve_heads(

         return query, key, value, edges

-    def attention_block(
+    def apply_gt(
         self,
         query: Tensor,
         key: Tensor,
         value: Tensor,
         edges: Tensor,
         edge_index: Adj,
         size: Union[int, tuple[int, int]],
-        num_chunks: int,
     ) -> Tensor:
         # self.conv requires size to be a tuple
         conv_size = (size, size) if isinstance(size, int) else size

+        if self.graph_attention_backend == "triton":
+            csc, perm, reverse = edge_index_to_csc(edge_index, num_nodes=conv_size, reverse=True)
+            edges_csc = edges.index_select(0, perm)
+            args_conv = (edges_csc, csc, reverse)
+        else:
+            args_conv = (edges, edge_index, conv_size)
+
+        return self.conv(query, key, value, *args_conv)
+
+    def attention_block(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        edges: Tensor,
+        edge_index: Adj,
+        size: Union[int, tuple[int, int]],
+        num_chunks: int,
+    ) -> Tensor:
+        # split 1-hop edges into chunks, compute self.conv chunk-wise
         if num_chunks > 1:
-            # split 1-hop edges into chunks, compute self.conv chunk-wise
             edge_attr_list, edge_index_list = sort_edges_1hop_chunks(
                 num_nodes=size, edge_attr=edges, edge_index=edge_index, num_chunks=num_chunks
             )
             # shape: (num_nodes, num_heads, out_channels_conv)
             out = torch.zeros((*query.shape[:-1], self.out_channels_conv), device=query.device)
             for i in range(num_chunks):
-                out += self.conv(
-                    query=query,
-                    key=key,
-                    value=value,
-                    edge_attr=edge_attr_list[i],
-                    edge_index=edge_index_list[i],
-                    size=conv_size,
+                out += self.apply_gt(
+                    query=query, key=key, value=value, edges=edge_attr_list[i], edge_index=edge_index_list[i], size=size
                 )
         else:
-            out = self.conv(query=query, key=key, value=value, edge_attr=edges, edge_index=edge_index, size=conv_size)
+            out = self.apply_gt(query=query, key=key, value=value, edges=edges, edge_index=edge_index, size=size)

         return out

@@ -635,6 +664,7 @@ def __init__(
         update_src_nodes: bool = False,
         layer_kernels: DotDict,
         shard_strategy: str = "edges",
+        graph_attention_backend: str = "triton",
         **kwargs,
     ) -> None:
         """Initialize GraphTransformerBlock.
@@ -662,6 +692,8 @@ def __init__(
            Defined in config/models/<model>.yaml
         shard_strategy: str, by default "edges"
             Strategy to shard tensors
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """

         super().__init__(
@@ -674,6 +706,7 @@ def __init__(
             bias=bias,
             qk_norm=qk_norm,
             update_src_nodes=update_src_nodes,
+            graph_attention_backend=graph_attention_backend,
             **kwargs,
         )

@@ -791,6 +824,7 @@ def __init__(
         qk_norm: bool = False,
         update_src_nodes: bool = False,
         layer_kernels: DotDict,
+        graph_attention_backend: str = "triton",
         **kwargs,
     ) -> None:
         """Initialize GraphTransformerBlock.
@@ -814,6 +848,8 @@ def __init__(
         layer_kernels : DotDict
             A dict of layer implementations e.g. layer_kernels.Linear = "torch.nn.Linear"
            Defined in config/models/<model>.yaml
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """

         super().__init__(
@@ -826,6 +862,7 @@ def __init__(
             bias=bias,
             qk_norm=qk_norm,
             update_src_nodes=update_src_nodes,
+            graph_attention_backend=graph_attention_backend,
             **kwargs,
         )

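For orientation, here is a minimal parity sketch (not part of this commit) of the two code paths wired up in `apply_gt` above, using only the call signatures visible in this diff. The per-head tensor layout `(nodes or edges, heads, channels)`, the random graph, the CUDA device, and the tolerance are all assumptions; the Triton path requires a GPU.

```python
import torch

from anemoi.models.layers.conv import GraphTransformerConv
from anemoi.models.triton.gt import GraphTransformerFunction
from anemoi.models.triton.utils import edge_index_to_csc

num_nodes, num_edges, num_heads, head_dim = 128, 512, 4, 16
device = "cuda"  # assumption: the Triton kernel runs on a GPU

# Assumed per-head layout: (nodes or edges, heads, channels).
query = torch.randn(num_nodes, num_heads, head_dim, device=device)
key = torch.randn(num_nodes, num_heads, head_dim, device=device)
value = torch.randn(num_nodes, num_heads, head_dim, device=device)
edges = torch.randn(num_edges, num_heads, head_dim, device=device)
edge_index = torch.randint(0, num_nodes, (2, num_edges), device=device)
size = (num_nodes, num_nodes)

# PyG reference path (the previous implementation).
conv = GraphTransformerConv(out_channels=head_dim)
out_pyg = conv(query=query, key=key, value=value, edge_attr=edges, edge_index=edge_index, size=size)

# Triton path: convert the COO edge_index to CSC and permute edge features to match.
csc, perm, reverse = edge_index_to_csc(edge_index, num_nodes=size, reverse=True)
edges_csc = edges.index_select(0, perm)
out_triton = GraphTransformerFunction.apply(query, key, value, edges_csc, csc, reverse)

print(torch.allclose(out_pyg, out_triton, atol=1e-3))  # tolerance is an assumption
```

A gradient comparison along the lines of the mlflow check in the description could be added by marking the inputs with `requires_grad=True` and comparing the backward passes of the two outputs.
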
models/src/anemoi/models/layers/mapper.py

Lines changed: 12 additions & 0 deletions
@@ -216,6 +216,7 @@ def __init__(
         cpu_offload: bool = False,
         layer_kernels: DotDict = None,
         shard_strategy: str = "edges",
+        graph_attention_backend: str = "triton",
     ) -> None:
         """Initialize GraphTransformerBaseMapper.

@@ -254,6 +255,8 @@ def __init__(
            Defined in config/models/<model>.yaml
         shard_strategy : str, optional
             Strategy to shard tensors, by default "edges"
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """
         super().__init__(
             in_channels_src=in_channels_src,
@@ -282,6 +285,7 @@ def __init__(
             qk_norm=qk_norm,
             layer_kernels=self.layer_factory,
             shard_strategy=shard_strategy,
+            graph_attention_backend=graph_attention_backend,
         )

         self.offload_layers(cpu_offload)
@@ -539,6 +543,7 @@ def __init__(
         cpu_offload: bool = False,
         layer_kernels: DotDict = None,
         shard_strategy: str = "edges",
+        graph_attention_backend: str = "triton",
     ) -> None:
         """Initialize GraphTransformerForwardMapper.

@@ -574,6 +579,8 @@ def __init__(
            A dict of layer implementations e.g. layer_kernels.Linear = "torch.nn.Linear"
         shard_strategy : str, optional
             Strategy to shard tensors, by default "edges"
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """
         super().__init__(
             in_channels_src=in_channels_src,
@@ -592,6 +599,7 @@ def __init__(
             dst_grid_size=dst_grid_size,
             layer_kernels=layer_kernels,
             shard_strategy=shard_strategy,
+            graph_attention_backend=graph_attention_backend,
         )

         self.emb_nodes_src = self.layer_factory.Linear(self.in_channels_src, self.hidden_dim)
@@ -643,6 +651,7 @@ def __init__(
         cpu_offload: bool = False,
         layer_kernels: DotDict = None,
         shard_strategy: str = "edges",
+        graph_attention_backend: str = "triton",
     ) -> None:
         """Initialize GraphTransformerBackwardMapper.

@@ -683,6 +692,8 @@ def __init__(
            Defined in config/models/<model>.yaml
         shard_strategy : str, optional
             Strategy to shard tensors, by default "edges"
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """
         super().__init__(
             in_channels_src=in_channels_src,
@@ -701,6 +712,7 @@ def __init__(
             dst_grid_size=dst_grid_size,
             layer_kernels=layer_kernels,
             shard_strategy=shard_strategy,
+            graph_attention_backend=graph_attention_backend,
         )

         self.node_data_extractor = nn.Sequential(

models/src/anemoi/models/layers/processor.py

Lines changed: 4 additions & 0 deletions
@@ -408,6 +408,7 @@ def __init__(
         qk_norm: bool = False,
         cpu_offload: bool = False,
         layer_kernels: DotDict,
+        graph_attention_backend: str = "triton",
         **kwargs,
     ) -> None:
         """Initialize GraphTransformerProcessor.
@@ -441,6 +442,8 @@ def __init__(
         layer_kernels : DotDict
             A dict of layer implementations e.g. layer_kernels.Linear = "torch.nn.Linear"
            Defined in config/models/<model>.yaml
+        graph_attention_backend: str, by default "triton"
+            Backend to use for graph transformer conv, options are "triton" and "pyg"
         """
         super().__init__(
             num_channels=num_channels,
@@ -465,6 +468,7 @@ def __init__(
             edge_dim=self.edge_dim,
             layer_kernels=self.layer_factory,
             qk_norm=qk_norm,
+            graph_attention_backend=graph_attention_backend,
         )

         self.offload_layers(cpu_offload)

models/src/anemoi/models/schemas/decoder.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class GraphTransformerDecoderSchema(TransformerModelComponent):
     @model_validator(mode="after")
     def check_valid_extras(self) -> Any:
         # This is a check to allow backwards compatibilty of the configs, as the extra fields are not required.
-        allowed_extras = {"shard_strategy": str}
+        allowed_extras = {"shard_strategy": str, "graph_attention_backend": str}
         extras = getattr(self, "__pydantic_extra__", {}) or {}
         for extra_field, value in extras.items():
             if extra_field not in allowed_extras:

models/src/anemoi/models/schemas/encoder.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ class GraphTransformerEncoderSchema(TransformerModelComponent):
     @model_validator(mode="after")
     def check_valid_extras(self) -> Any:
         # This is a check to allow backwards compatibilty of the configs, as the extra fields are not required.
-        allowed_extras = {"shard_strategy": str}
+        allowed_extras = {"shard_strategy": str, "graph_attention_backend": str}
         extras = getattr(self, "__pydantic_extra__", {}) or {}
         for extra_field, value in extras.items():
             if extra_field not in allowed_extras:

models/src/anemoi/models/schemas/processor.py

Lines changed: 15 additions & 0 deletions
@@ -43,6 +43,21 @@ class GraphTransformerProcessorSchema(TransformerModelComponent):
     qk_norm: bool = Field(example=False)
     "Normalize the query and key vectors. Default to False."

+    @model_validator(mode="after")
+    def check_valid_extras(self) -> Any:
+        # This is a check to allow backwards compatibilty of the configs, as the extra fields are not required.
+        allowed_extras = {"graph_attention_backend": str}
+        extras = getattr(self, "__pydantic_extra__", {}) or {}
+        for extra_field, value in extras.items():
+            if extra_field not in allowed_extras:
+                msg = f"Extra field '{extra_field}' is not allowed. Allowed fields are: {list(allowed_extras.keys())}."
+                raise ValueError(msg)
+            if not isinstance(value, allowed_extras[extra_field]):
+                msg = f"Extra field '{extra_field}' must be of type {allowed_extras[extra_field].__name__}."
+                raise TypeError(msg)
+
+        return self
+

 class TransformerProcessorSchema(TransformerModelComponent):
     target_: Literal["anemoi.models.layers.processor.TransformerProcessor"] = Field(..., alias="_target_")
