StashCalculation: a new CalcJob plugin (aiidateam#6772)

khsrali · web-flow · commit bc253236d7ba · 2025-04-30T11:32:40.000+02:00
Historically, stashing was only possible if it was requested before running a generic calcjob. The instruction had to be "attached" to the original calcjob, for example:

```python
inputs = {
    'MyInputs': <MyInputs>,
    'metadata': {
        'computer': Computer.collection.get(label="localhost"),
        'options': {
            'resources': {'num_machines': 1}, 
            'stash': {
                'stash_mode': StashMode.COPY.value,
                'target_base': '/scratch/',
                'source_list': ['heavy_data.xyz'],
            },
        },
    },
}
run(MyCalculation, **inputs)

```

However, if a user realized only after running a calcjob that they needed to stash something, this was not possible.

This commit introduces a new calcjob, which is able to perform a stashing operation after a calculation has finished.
The usage is very similar and, for consistency and user-friendliness, the instruction remains part of the metadata. The only main input is a source node, which is the `RemoteData` node of the calculation to be stashed, for example:

```python
StashCalculation_ = CalculationFactory('core.stash')


MyCalculation = orm.load_node(pk=<PK>)
inputs = {
    'metadata': {
        'computer': Computer.collection.get(label="localhost"),
        'options': {
            'resources': {'num_machines': 1}, 
            'stash': {
                'stash_mode': StashMode.COPY.value,
                'target_base': '/scratch/',
                'source_list': ['heavy_data.xyz'],
            },
        },
    },
    'source_node': MyCalculation.outputs.remote_folder,
}

result = run(StashCalculation_, **inputs)

```
diff --git a/pyproject.toml b/pyproject.toml
@@ -64,6 +64,7 @@ requires-python = '>=3.9'
 
 [project.entry-points.'aiida.calculations']
 'core.arithmetic.add' = 'aiida.calculations.arithmetic.add:ArithmeticAddCalculation'
+'core.stash' = 'aiida.calculations.stash:StashCalculation'
 'core.templatereplacer' = 'aiida.calculations.templatereplacer:TemplatereplacerCalculation'
 'core.transfer' = 'aiida.calculations.transfer:TransferCalculation'
 
diff --git a/src/aiida/calculations/stash.py b/src/aiida/calculations/stash.py
@@ -0,0 +1,81 @@
+###########################################################################
+# Copyright (c), The AiiDA team. All rights reserved.                     #
+# This file is part of the AiiDA code.                                    #
+#                                                                         #
+# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
+# For further information on the license, see the LICENSE.txt file        #
+# For further information please visit http://www.aiida.net               #
+###########################################################################
+"""Implementation of `StashCalculation`, a `CalcJob` plugin to stash files of a `RemoteData` node."""
+
+from aiida import orm
+from aiida.common.datastructures import CalcInfo
+from aiida.engine import CalcJob
+
+
+class StashCalculation(CalcJob):
+    """
+    Utility to stash files/folders from `RemoteData`, `SinglefileData`, or `FolderData`.
+
+    An example of how the input should look like:
+
+    .. code-block:: python
+
+        inputs = {
+            'metadata': {
+                'computer': Computer.collection.get(label="localhost"),
+                'options': {
+                    'resources': {'num_machines': 1},
+                    'stash': {
+                        'stash_mode': StashMode.COPY.value,
+                        'target_base': '/scratch/my_stashing/',
+                        'source_list': ['aiida.in', '_aiidasubmit.sh'],
+                    },
+                },
+            },
+            'source_node': node_1,
+        }
+
+    Ideally one could use the same computer as the one of the `source_node`.
+    However if you cannot access the stash storage from the same computer anymore
+    but you have access to it from another computer, you can can specify the computer in `metadata.computer`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def define(cls, spec):
+        super().define(spec)
+
+        spec.input(
+            'source_node',
+            valid_type=orm.RemoteData,
+            required=True,
+            help='',
+        )
+
+        # Code is irrelevant for this calculation.
+        spec.inputs.pop('code', None)
+
+        spec.inputs['metadata']['computer'].required = True
+        spec.inputs['metadata']['options']['stash'].required = True
+        spec.inputs['metadata']['options']['stash']['stash_mode'].required = True
+        spec.inputs['metadata']['options']['stash']['target_base'].required = True
+        spec.inputs['metadata']['options']['stash']['source_list'].required = True
+        spec.inputs['metadata']['options']['resources'].default = {
+            'num_machines': 1,
+            'num_mpiprocs_per_machine': 1,
+        }
+
+    def prepare_for_submission(self, folder):
+        calc_info = CalcInfo()
+        calc_info.skip_submit = True
+
+        calc_info.codes_info = []
+        calc_info.retrieve_list = []
+        calc_info.local_copy_list = []
+        calc_info.remote_copy_list = []
+        calc_info.remote_symlink_list = []
+
+        return calc_info
diff --git a/src/aiida/calculations/transfer.py b/src/aiida/calculations/transfer.py
@@ -185,7 +185,7 @@ def define(cls, spec):
             help='All the nodes that contain files referenced in the instructions.',
         )
 
-        # The transfer just needs a computer, the code are resources are set here
+        # The transfer just needs a computer, the code and resources are set here
         spec.inputs.pop('code', None)
         spec.inputs['metadata']['computer'].required = True
         spec.inputs['metadata']['options']['resources'].default = {
diff --git a/src/aiida/engine/daemon/execmanager.py b/src/aiida/engine/daemon/execmanager.py
@@ -440,11 +440,19 @@ async def stash_calculation(calculation: CalcJobNode, transport: Transport) -> N
 
     logger_extra = get_dblogger_extra(calculation)
 
+    if calculation.process_type == 'aiida.calculations:core.stash':
+        remote_node = load_node(calculation.inputs.source_node.pk)
+        uuid = remote_node.uuid
+        source_basepath = Path(remote_node.get_remote_path())
+    else:
+        uuid = calculation.uuid
+        source_basepath = Path(calculation.get_remote_workdir())
+
     stash_options = calculation.get_option('stash')
     stash_mode = stash_options.get('stash_mode')
     source_list = stash_options.get('source_list', [])
-    uuid = calculation.uuid
-    source_basepath = Path(calculation.get_remote_workdir())
+    target_base = Path(stash_options['target_base'])
+    dereference = stash_options.get('dereference', False)
 
     if not source_list:
         return
@@ -454,7 +462,7 @@ async def stash_calculation(calculation: CalcJobNode, transport: Transport) -> N
     )
 
     if stash_mode == StashMode.COPY.value:
-        target_basepath = Path(stash_options['target_base']) / uuid[:2] / uuid[2:4] / uuid[4:]
+        target_basepath = target_base / uuid[:2] / uuid[2:4] / uuid[4:]
 
         for source_filename in source_list:
             if transport.has_magic(source_filename):
@@ -475,7 +483,7 @@ async def stash_calculation(calculation: CalcJobNode, transport: Transport) -> N
                 except (OSError, ValueError) as exception:
                     EXEC_LOGGER.warning(f'failed to stash {source_filepath} to {target_filepath}: {exception}')
                     # try to clean up in case of a failure
-                    await transport.rmtree_async(Path(stash_options['target_base']) / uuid[:2])
+                    await transport.rmtree_async(target_base / uuid[:2])
                 else:
                     EXEC_LOGGER.debug(f'stashed {source_filepath} to {target_filepath}')
 
@@ -496,12 +504,10 @@ async def stash_calculation(calculation: CalcJobNode, transport: Transport) -> N
         # 'tar', 'tar.gz', 'tar.bz2', or 'tar.xz'
         compression_format = stash_mode
         file_name = uuid
-        dereference = stash_options.get('dereference', False)
-        target_basepath = Path(stash_options['target_base'])
         authinfo = calculation.get_authinfo()
         aiida_remote_base = authinfo.get_workdir().format(username=transport.whoami())
 
-        target_destination = str(target_basepath / file_name) + '.' + compression_format
+        target_destination = str(target_base / file_name) + '.' + compression_format
 
         remote_stash = RemoteStashCompressedData(
             computer=calculation.computer,
diff --git a/src/aiida/engine/processes/calcjobs/tasks.py b/src/aiida/engine/processes/calcjobs/tasks.py
@@ -481,7 +481,30 @@ def load_instance_state(self, saved_state, load_context):
         self._killing = None
 
     async def execute(self) -> plumpy.process_states.State:  # type: ignore[override]
-        """Override the execute coroutine of the base `Waiting` state."""
+        """Override the execute coroutine of the base `Waiting` state.
+
+        Using the plumpy state machine, the waiting state is repeatedly re-entered with different commands.
+        The waiting state is not always the same instance: it may be re-instantiated when re-entering this method,
+        therefore any attribute newly created in a command block (e.g. `SUBMIT_COMMAND`, `UPLOAD_COMMAND`, etc.)
+        will be lost and is not usable in other blocks. The advantage of this design is that the sequence is
+        interruptible, meaning the process can potentially come back and start from where it left off.
+
+        The overall sequence is as follows:
+        in case `skip_submit` is True:
+
+        UPLOAD -> STASH -> RETRIEVE
+        |   ^     |   ^     |   ^
+        v   |     v   |     v   |
+        .. ..     .. ..     .. ..
+
+        otherwise:
+
+        UPLOAD -> SUBMIT -> UPDATE -> STASH -> RETRIEVE
+        |   ^     |   ^     |   ^     |   ^     |   ^
+        v   |     v   |     v   |     v   |     v   |
+        .. ..     .. ..     .. ..     .. ..     .. ..
+        """
+
         node = self.process.node
         transport_queue = self.process.runner.transport
         result: plumpy.process_states.State = self
@@ -493,7 +516,7 @@ async def execute(self) -> plumpy.process_states.State:  # type: ignore[override
             if self._command == UPLOAD_COMMAND:
                 skip_submit = await self._launch_task(task_upload_job, self.process, transport_queue)
                 if skip_submit:
-                    result = self.retrieve(monitor_result=self._monitor_result)
+                    result = self.stash(monitor_result=self._monitor_result)
                 else:
                     result = self.submit()
 
diff --git a/tests/calculations/test_stash.py b/tests/calculations/test_stash.py
@@ -0,0 +1,52 @@
+###########################################################################
+# Copyright (c), The AiiDA team. All rights reserved.                     #
+# This file is part of the AiiDA code.                                    #
+#                                                                         #
+# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
+# For further information on the license, see the LICENSE.txt file        #
+# For further information please visit http://www.aiida.net               #
+###########################################################################
+"""Tests for the `StashCalculation` plugin.
+
+Note: testing the main functionality is done via `test_execmanager.py`.
+Here, we mainly check for redirection of the calcjob.
+"""
+
+import pytest
+
+from aiida import orm
+from aiida.common.datastructures import StashMode
+
+
+@pytest.mark.requires_rmq
+def test_stash_calculation_basic(fixture_sandbox, aiida_localhost, generate_calc_job, tmp_path):
+    """Test that the basic implementation of `StashCalculation` functions."""
+
+    target_base = tmp_path / 'target'
+    source = tmp_path / 'source'
+    source.mkdir()
+
+    inputs = {
+        'metadata': {
+            'computer': aiida_localhost,
+            'options': {
+                'resources': {'num_machines': 1},
+                'stash': {
+                    'stash_mode': StashMode.COPY.value,
+                    'target_base': str(target_base),
+                    'source_list': ['*'],
+                },
+            },
+        },
+        'source_node': orm.RemoteData(computer=aiida_localhost, remote_path=str(source)),
+    }
+    entry_point_name = 'core.stash'
+    calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
+
+    assert calc_info.skip_submit is True
+
+    assert calc_info.codes_info == []
+    assert calc_info.retrieve_list == []
+    assert calc_info.local_copy_list == []
+    assert calc_info.remote_copy_list == []
+    assert calc_info.remote_symlink_list == []

Original file line number	Diff line number	Diff line change
`@@ -185,7 +185,7 @@ def define(cls, spec):`
`185`	`185`	`help='All the nodes that contain files referenced in the instructions.',`
`186`	`186`	`)`
`187`	`187`
`188`		`- # The transfer just needs a computer, the code are resources are set here`
	`188`	`+ # The transfer just needs a computer, the code and resources are set here`
`189`	`189`	`spec.inputs.pop('code', None)`
`190`	`190`	`spec.inputs['metadata']['computer'].required = True`
`191`	`191`	`spec.inputs['metadata']['options']['resources'].default = {`