Merged (changes from 10 commits)
@@ -168,7 +168,7 @@ def des_tables(
table3: FloatField,
table4: FloatField,
):
with computation(FORWARD), interval(...):
with computation(FORWARD), interval(0, -1):
des1 = max(0.0, table1[0, 0, 1] - table1)
des2 = max(0.0, table2[0, 0, 1] - table2)
des3 = max(0.0, table3[0, 0, 1] - table3)
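The narrowed interval pairs with the [0, 0, 1] offsets above: each level reads k+1, and the last level has no such neighbour, so the FORWARD pass presumably has to stop one level short. A minimal NumPy sketch of the indexing this expresses (plain NumPy, not GT4Py; nk and the table values are made-up stand-ins):

import numpy as np

nk = 6
table1 = np.linspace(0.0, 1.0, nk) ** 2      # stand-in column of a lookup table

des1 = np.zeros(nk)
# interval(0, -1): levels 0 .. nk-2 only, because each level reads k+1
des1[:-1] = np.maximum(0.0, table1[1:] - table1[:-1])
# the last level has no k+1 neighbour and is left untouched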
@@ -51,8 +51,6 @@ def calculate_derived_states(
"""
Computes derived state fields required for the rest of the GFDL single moment
microphysics module.

This stencil MUST be built using Z_INTERFACE_DIM to function properly.
"""
from __externals__ import k_end

@@ -377,7 +375,6 @@ def __call__(
dsnow_dt=dsnowdt_macro,
dgraupel_dt=dgraupeldt_macro,
)

self._calculate_derived_states(
p_interface=p_interface,
p_interface_mb=local_p_interface_mb,
@@ -1,11 +1,13 @@
from __future__ import annotations

from functools import lru_cache
from math import prod
from types import ModuleType
from typing import List, Optional, Tuple, TypeAlias
from typing import Tuple, TypeAlias

import cffi
import numpy as np
import numpy.lib as npl
import numpy.typing as npt
from _cffi_backend import _CDataBase as CFFIObj

@@ -34,9 +36,19 @@ def __exit__(self, exc_type, exc_value, traceback):
pass


@lru_cache
def _compute_fortran_strides(shape: Tuple[int, ...], dtype: npt.DTypeLike) -> tuple[int, ...]:
cshp = np.cumprod(shape)
return np.concatenate(([1], cshp[:-1])) * np.dtype(dtype).itemsize


class FortranPythonConversion:
"""
Convert Fortran arrays to NumPy and vice-versa
Convert Fortran arrays to NumPy and vice-versa.

The Python memory will be laid out Fortran-like (column-major order),
e.g. 3D element strides of [1, dim0, dim0*dim1].
"""

def __init__(
@@ -81,136 +93,97 @@ def device_sync(self):
def _fortran_to_numpy(
self,
fptr: cffi.FFI.CData,
dim: Optional[List[int]] = None,
dims: list[int] | None,
) -> np.ndarray:
"""
Input: Fortran data pointed to by fptr, of shape dims = (i, j, k)
Output: C-ordered double precision NumPy data of shape (i, j, k)
Output: 1-D NumPy view over the same buffer (length i*j*k); no copy, layout unchanged
"""
if not dim:
dim = [self._npx, self._npy, self._npz]
if not dims:
dims = [self._npx, self._npy, self._npz]
ftype = self._ffi.getctype(self._ffi.typeof(fptr).item)
if ftype not in self._TYPEMAP:
raise ValueError(f"Fortran Python memory converter: cannot convert type {ftype}")
return np.frombuffer(
self._ffi.buffer(fptr, prod(dim) * self._ffi.sizeof(ftype)),
self._ffi.buffer(fptr, prod(dims) * self._ffi.sizeof(ftype)),
self._TYPEMAP[ftype],
)

def _upload_and_transform(
self,
host_array: np.ndarray,
dim: Optional[List[int]] = None,
swap_axes: Optional[Tuple[int, int]] = None,
dims: list[int] | None,
) -> DeviceArray:
"""Upload to device & transform to Pace compatible layout"""
with self._current_stream:
device_array = cp.asarray(host_array)
final_array = self._transform_from_fortran_layout(
device_array,
dim,
swap_axes,
)
self._current_stream = (
self._stream_A if self._current_stream == self._stream_B else self._stream_B
)
return final_array
device_array = cp.asarray(host_array)
final_array = self._transform_from_fortran_layout(device_array, dims)
self._current_stream = self._stream_A if self._current_stream == self._stream_B else self._stream_B
return final_array

def _transform_from_fortran_layout(
self,
array: PythonArray,
dim: Optional[List[int]] = None,
swap_axes: Optional[Tuple[int, int]] = None,
dims: list[int] | None,
) -> PythonArray:
"""Transform from Fortran layout into a Pace compatible layout"""
if not dim:
dim = [self._npx, self._npy, self._npz]
trf_array = array.reshape(tuple(reversed(dim))).transpose().astype(Float)
if swap_axes:
trf_array = self._target_np.swapaxes(
trf_array,
swap_axes[0],
swap_axes[1],
)
if not dims:
dims = [self._npx, self._npy, self._npz]
trf_array = npl.stride_tricks.as_strided(
array,
shape=dims,
strides=_compute_fortran_strides(tuple(dims), Float),
)
return trf_array

def fortran_to_python(
self,
fptr: cffi.FFI.CData,
dim: Optional[List[int]] = None,
swap_axes: Optional[Tuple[int, int]] = None,
dims: list[int] | None,
*,
allow_device_transfer: bool = True,
) -> PythonArray:
"""Move fortran memory into python space"""
np_array = self._fortran_to_numpy(fptr, dim)
if self._python_targets_gpu:
return self._upload_and_transform(np_array, dim, swap_axes)
"""Move fortran memory into python space."""
np_array = self._fortran_to_numpy(fptr, dims)
if allow_device_transfer and self._python_targets_gpu:
return self._upload_and_transform(np_array, dims)
else:
return self._transform_from_fortran_layout(
np_array,
dim,
swap_axes,
dims,
)

def _transform_and_download(
self,
device_array: DeviceArray,
dtype: type,
swap_axes: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
with self._current_stream:
if swap_axes:
device_array = cp.swapaxes(
device_array,
swap_axes[0],
swap_axes[1],
)
host_array = cp.asnumpy(
device_array.astype(dtype).flatten(order="F"),
)
self._current_stream = (
self._stream_A if self._current_stream == self._stream_B else self._stream_B
)
return host_array
def _transform_and_download(self, device_array: DeviceArray, dtype: type) -> np.ndarray:
host_array = cp.asnumpy(device_array.astype(dtype).flatten(order="F"))
self._current_stream = self._stream_A if self._current_stream == self._stream_B else self._stream_B
return host_array

def _transform_from_python_layout(
self,
array: PythonArray,
dtype: type,
swap_axes: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
"""Copy back a numpy array in python layout to Fortran"""

if self._python_targets_gpu:
numpy_array = self._transform_and_download(array, dtype, swap_axes)
numpy_array = self._transform_and_download(array, dtype)
else:
numpy_array = array.astype(dtype).flatten(order="F")
if swap_axes:
numpy_array = np.swapaxes(
numpy_array,
swap_axes[0],
swap_axes[1],
)
return numpy_array

def python_to_fortran(
self,
array: PythonArray,
fptr: cffi.FFI.CData,
ptr_offset: int = 0,
swap_axes: Optional[Tuple[int, int]] = None,
) -> None:
"""
Input: Python array in Pace layout, of shape (i, j, k)
Output: C-ordered double precision NumPy data of shape (i, j, k)
Output: F-ordered data written into the Fortran memory pointed to by fptr
"""
ftype = self._ffi.getctype(self._ffi.typeof(fptr).item)
assert ftype in self._TYPEMAP
dtype = self._TYPEMAP[ftype]
numpy_array = self._transform_from_python_layout(
array,
dtype,
swap_axes,
)
numpy_array = self._transform_from_python_layout(array, dtype)
self._ffi.memmove(fptr + ptr_offset, numpy_array, 4 * numpy_array.size)

def cast(self, dtype: npt.DTypeLike, void_ptr: CFFIObj) -> CFFIObj:
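The new _compute_fortran_strides helper plus as_strided replace the old reshape/transpose and astype copy: the flat buffer coming from Fortran is now viewed in place with column-major strides. A small self-contained NumPy sketch of that stride formula (the shape and dtype below are arbitrary stand-ins for the real dims and Float):

import numpy as np

def fortran_strides(shape, dtype):
    # byte strides [1, dim0, dim0*dim1, ...] * itemsize, as in _compute_fortran_strides
    cshp = np.cumprod(shape)
    return tuple(int(s) for s in np.concatenate(([1], cshp[:-1])) * np.dtype(dtype).itemsize)

shape = (3, 4, 5)
assert fortran_strides(shape, np.float32) == np.zeros(shape, np.float32, order="F").strides

# Viewing a flat Fortran buffer with these strides gives a column-major (i, j, k)
# array without a copy, which is what _transform_from_fortran_layout now does.
flat = np.arange(np.prod(shape), dtype=np.float32)
view = np.lib.stride_tricks.as_strided(
    flat, shape=shape, strides=fortran_strides(shape, np.float32)
)
assert view[1, 0, 0] == flat[1] and view[0, 1, 0] == flat[3]

Note that, unlike the old astype-based path, a strided view cannot change precision, so this assumes the Fortran buffer already holds data in the Float dtype.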
@@ -4,7 +4,6 @@

from ndsl import QuantityFactory, StencilFactory
from ndsl.constants import X_DIM, Y_DIM, Z_INTERFACE_DIM
from ndsl.dsl.gt4py_utils import is_gpu_backend
from pyMoist.GFDL_1M import GFDL1M, GFDL1MConfig, GFDL1MState
from pyMoist.interface import (
InterfaceTransferType,
@@ -16,16 +15,15 @@


class GFDL1MInterface:
def __init__(self, quantity_factory: QuantityFactory, stencil_factory: StencilFactory) -> None:
def __init__(
self,
quantity_factory: QuantityFactory,
stencil_factory: StencilFactory,
transfer_type: InterfaceTransferType,
) -> None:
self._stencil_factory = stencil_factory
self._quantity_factory = quantity_factory
self._gfdl_1m: GFDL1M | None = None

if is_gpu_backend(quantity_factory.backend):
transfer_type = InterfaceTransferType.CPU_TO_GPU_TO_CPU
else:
transfer_type = InterfaceTransferType.ALL_CPU

self._managed_state = MAPLManagedState(
GFDL1MState.zeros(quantity_factory),
transfer_type,
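With this change the constructor no longer infers the transfer mode from the backend; the caller passes an InterfaceTransferType explicitly. A hedged sketch of what a call site could look like, reusing the is_gpu_backend helper the old constructor imported (the pick_transfer_type name and the CPU_COPY default are illustrative assumptions, not part of this diff):

from ndsl.dsl.gt4py_utils import is_gpu_backend
from pyMoist.interface import InterfaceTransferType

def pick_transfer_type(backend: str) -> InterfaceTransferType:
    # GPU backends round-trip host memory through the device;
    # CPU backends either copy (layout mismatch) or map (same layout).
    if is_gpu_backend(backend):
        return InterfaceTransferType.CPU_TO_GPU_TO_CPU
    return InterfaceTransferType.CPU_COPY

# interface = GFDL1MInterface(quantity_factory, stencil_factory,
#                             pick_transfer_type(quantity_factory.backend))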
@@ -4,14 +4,19 @@

from ndsl import State
from ndsl.constants import X_DIM, Y_DIM, Z_DIM, Float
from ndsl.optional_imports import cupy as cp
from pyMoist.interface import InterfaceTransferType
from pyMoist.interface.mapl.memory_factory import MAPLMemoryRepository


class MAPLManagedState:
"""Manage a NDSL <> MAPL shared state by linking MAPL pointers to NDSL state fields"""

def __init__(self, py_state: State, transfer_type: InterfaceTransferType) -> None:
def __init__(
self,
py_state: State,
transfer_type: InterfaceTransferType,
) -> None:
self._ndsl_state = py_state
self._state_to_mapl_mapping: dict[str, Tuple[MAPLMemoryRepository, str]] = {}
self._transfer_type = transfer_type
@@ -62,10 +67,16 @@ def _pull_from_fortran(
)
else:
mapl_array = mapl_state_.get_from_fortran(mapl_field_)
if self._transfer_type == InterfaceTransferType.CPU_TO_GPU_TO_CPU:
cp.cuda.runtime.deviceSynchronize()
if mapl_array is None:
setattr(ndsl_state_, ndsl_field_, None)
else:
elif self._transfer_type == InterfaceTransferType.CPU_COPY:
getattr(ndsl_state_, ndsl_field_).field[:] = mapl_array[:]
elif self._transfer_type == InterfaceTransferType.CPU_MAP:
getattr(ndsl_state_, ndsl_field_).data = mapl_array
else:
raise ValueError("Transfer type unknown for Fortran/NDSL")

for ndsl_field, (mapl_state, mapl_field) in self._state_to_mapl_mapping.items():
try:
@@ -77,6 +88,10 @@ def _push_back_to_fortran(
def ndsl_to_fortran(self) -> None:
"""Copy all Python memory back in Fortan"""

# Skip sending back - we are mapped
if self._transfer_type == InterfaceTransferType.CPU_MAP:
return

def _push_back_to_fortran(
mapl_field_: str,
mapl_state_: MAPLMemoryRepository,
@@ -93,10 +108,18 @@ def _push_back_to_fortran(
)
else:
mapl_array = mapl_state_.get_from_fortran(mapl_field_)
if mapl_array is not None:
if mapl_array is None:
pass
elif self._transfer_type == InterfaceTransferType.CPU_COPY:
ndsl_array = getattr(ndsl_state_, ndsl_field_).field[:]
mapl_array[:] = ndsl_array[:]
mapl_state_.send_to_fortran(mapl_field_)
elif self._transfer_type == InterfaceTransferType.CPU_MAP:
raise RuntimeError("Coding issue. We should never send back mapped data")
else:
raise ValueError("Transfer type unknown for NDSL/Fortran")

for ndsl_field, (mapl_state, mapl_field) in self._state_to_mapl_mapping.items():
_push_back_to_fortran(mapl_field, mapl_state, ndsl_field, self._ndsl_state)
if self._transfer_type == InterfaceTransferType.CPU_TO_GPU_TO_CPU:
cp.cuda.runtime.deviceSynchronize()
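The pull/push logic above distinguishes CPU_COPY, which duplicates values into NDSL-owned memory and must push them back, from CPU_MAP, which aliases the Fortran buffer so ndsl_to_fortran can return early. A standalone NumPy sketch of the two semantics (plain arrays standing in for the MAPL and NDSL fields):

import numpy as np

mapl_array = np.arange(4, dtype=np.float64)   # stands in for Fortran-owned memory

# CPU_COPY: values are duplicated, so writes stay local until an explicit copy-back
ndsl_copy = np.zeros(4)
ndsl_copy[:] = mapl_array[:]
ndsl_copy[0] = 42.0
assert mapl_array[0] == 0.0

# CPU_MAP: the NDSL field aliases the same buffer, writes are immediately visible
# to Fortran, so no copy-back step is needed
ndsl_map = mapl_array
ndsl_map[0] = 42.0
assert mapl_array[0] == 42.0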
@@ -29,7 +29,7 @@ class FortranMemory:
"""Is the pointer properly associated in Fortran"""
shape: tuple[int, ...]
"""Shape of the array"""
python_array: npt.NDArray
python_array: npt.NDArray | None
"""Synced python memory, check dirty"""
dirty: bool = False
"""Does the memory need to be synced"""
@@ -71,24 +71,33 @@ def register(
pointer=cast_ptr,
associated=is_associated,
shape=self._quantity_factory.sizer.get_extent(dims),
python_array=np.empty((0)),
python_array=None,
)

def get_from_fortran(
self,
name: str,
) -> npt.NDArray:
*,
allow_device_transfer: bool = True,
) -> npt.NDArray | None:
"""Retrieve the data from Fortran. Prefer using a MAPLManager."""
try:
fmem = self._fortran_pointers[name]
except KeyError:
raise KeyError(f"Pointer {name} was never registered.")

if not fmem.associated:
return
return None

# We rely here on `fmem.pointer` being constant, so we could skip the
# re-mapping below and simply `return fmem.python_array`. We don't,
# because we want to introduce a ZERO TRUST mode where we do not rely
# on the Fortran side keeping its memory constant.

fmem.python_array = self._f_py_converter.fortran_to_python(
fptr=fmem.pointer,
dim=list(fmem.shape),
dims=list(fmem.shape),
allow_device_transfer=allow_device_transfer,
)

return fmem.python_array
@@ -7,5 +7,6 @@ class MemorySpace(enum.Enum):


class InterfaceTransferType(enum.Enum):
ALL_CPU = enum.auto()
CPU_COPY = enum.auto() # Copies because of layout mismatch
CPU_MAP = enum.auto() # No copy - memory map - same layout
CPU_TO_GPU_TO_CPU = enum.auto()