From 948ffc5bdd478d57ec04b47e7e17d2064dc6bcc1 Mon Sep 17 00:00:00 2001
From: Alexander Goscinski
Date: Fri, 9 May 2025 10:57:51 +0200
Subject: [PATCH 1/5] Regular killing reschedules a cancel of scheduler job (#6870)

Squashed commit at 2025-05-09 21:53

PR #6793 introduced the cancellation of earlier kill actions. This had the
problem that, if two kill commands are sent in sequence, the second kill action
cancels the first one, which had triggered the cancellation of the scheduler
job within an EBM. The second kill command, however, did not retrigger the
cancellation of the scheduler job. This bug appeared because the killing logic
lives in two places. More information can be found in PR #6868, which fixes
this properly by refactoring the kill action. This PR only serves as a quick
temporary fix with workarounds.

Before this PR, when the kill command failed through the EBM, the scheduler
job could no longer be cancelled through a kill. Since we now have the
force-kill option to bypass the EBM, we can reschedule the cancellation of the
scheduler job to gracefully kill a process.
---
 src/aiida/engine/processes/calcjobs/tasks.py |  1 -
 src/aiida/engine/processes/process.py        | 56 +++++++++++++-
 src/aiida/engine/utils.py                    |  1 +
 tests/cmdline/commands/test_process.py       | 80 ++++++++++++++++++--
 4 files changed, 130 insertions(+), 8 deletions(-)

diff --git a/src/aiida/engine/processes/calcjobs/tasks.py b/src/aiida/engine/processes/calcjobs/tasks.py
index e70547f094..af5eb1e31d 100644
--- a/src/aiida/engine/processes/calcjobs/tasks.py
+++ b/src/aiida/engine/processes/calcjobs/tasks.py
@@ -582,7 +582,6 @@ async def execute(self) -> plumpy.process_states.State:  # type: ignore[override
         except TransportTaskException as exception:
             raise plumpy.process_states.PauseInterruption(f'Pausing after failed transport task: {exception}')
         except plumpy.process_states.KillInterruption as exception:
-            await self._kill_job(node, transport_queue)
             node.set_process_status(str(exception))
             return self.retrieve(monitor_result=self._monitor_result)
         except (plumpy.futures.CancelledError, asyncio.CancelledError):
diff --git a/src/aiida/engine/processes/process.py b/src/aiida/engine/processes/process.py
index edbeca8704..2cbbbb47c8 100644
--- a/src/aiida/engine/processes/process.py
+++ b/src/aiida/engine/processes/process.py
@@ -51,7 +51,9 @@
 from aiida.common.lang import classproperty, override
 from aiida.common.links import LinkType
 from aiida.common.log import LOG_LEVEL_REPORT
+from aiida.engine.utils import InterruptableFuture
 from aiida.orm.implementation.utils import clean_value
+from aiida.orm.nodes.process.calculation.calcjob import CalcJobNode
 from aiida.orm.utils import serialize
 
 from .builder import ProcessBuilder
@@ -72,6 +74,7 @@ class Process(PlumpyProcess):
     have full provenance saved in the database.
""" + _cancelling_scheduler_job: asyncio.Task | None = None _node_class = orm.ProcessNode _spec_class = ProcessSpec @@ -336,10 +339,43 @@ def kill(self, msg_text: str | None = None, force_kill: bool = False) -> Union[b """ self.node.logger.info(f'Request to kill Process<{self.node.pk}>') - had_been_terminated = self.has_terminated() + # PR_COMMENT Because we need to overwrite the logic of the cancelation of the self._killing task of the + # scheduler job, we need to copy this logic of the parent class in plumpy, we need to adapt the + # cancelation of the last sent killing action to also resend the kill/cancelation of the scheduler + # job as we stop this canelation by canceling the last killing action + if self.killed(): + # Already killed + return True + + if self.has_terminated(): + # Can't kill + return False + + # Cancel scheduler job + if not force_kill and isinstance(self.node, CalcJobNode): + if self._killing: + self._killing.cancel() + + # PR_COMMENT: We cannot reuse _killing because of type issues, it is a CancellableAction. + # We can wrap a task around a CancellableAction but the CancellableAction catches silently any + # error whilel here we need to know if the cancelation of the scheduler job failed. + if self._cancelling_scheduler_job: + self._cancelling_scheduler_job.cancel() + + from .calcjobs.tasks import task_kill_job + + coro = self._launch_task(task_kill_job, self.node, self.runner.transport) + self._cancelling_scheduler_job = asyncio.create_task(coro) + try: + self.loop.run_until_complete(self._cancelling_scheduler_job) + except Exception as exc: + self.node.logger.error(f'While cancelling job error was raised: {exc!s}') + return False result = super().kill(msg_text, force_kill) + had_been_terminated = self.has_terminated() + # Only kill children if we could be killed ourselves if result is not False and not had_been_terminated: killing = [] @@ -374,6 +410,24 @@ def done(done_future: plumpy.futures.Future): return result + # PR_COMMENT This is a copy of the function in engine/processes/calcjobs/tasks.py + # and will merged to one place in PR #6868 + async def _launch_task(self, coro, *args, **kwargs): + """Launch a coroutine as a task, making sure to make it interruptable.""" + import functools + + from aiida.engine.utils import interruptable_task + + self._task: Union[InterruptableFuture, None] + + task_fn = functools.partial(coro, *args, **kwargs) + try: + self._task = interruptable_task(task_fn) + result = await self._task + return result + finally: + self._task = None + @override def out(self, output_port: str, value: Any = None) -> None: """Attach output to output port. diff --git a/src/aiida/engine/utils.py b/src/aiida/engine/utils.py index 86517a5ada..b383bfe086 100644 --- a/src/aiida/engine/utils.py +++ b/src/aiida/engine/utils.py @@ -193,6 +193,7 @@ async def exponential_backoff_retry( :param ignore_exceptions: exceptions to ignore, i.e. 
when caught do nothing and simply re-raise :return: result if the ``coro`` call completes within ``max_attempts`` retries without raising """ + if logger is None: logger = LOGGER diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index 411f013397..6c6709e544 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -25,6 +25,7 @@ from aiida.common.log import LOG_LEVEL_REPORT from aiida.engine import Process, ProcessState from aiida.engine.processes import control as process_control +from aiida.engine.utils import exponential_backoff_retry from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode from tests.utils.processes import WaitProcess @@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams( try: pid = os.getpid() + # For easier debugging you can change these to stdout sys.stdout = open(log_dir / f'worker-{pid}.out', 'w') sys.stderr = open(log_dir / f'worker-{pid}.err', 'w') start_daemon_worker(False, aiida_profile_name) @@ -72,10 +74,22 @@ def mock_open(_): raise Exception('Mock open exception') @staticmethod - async def mock_exponential_backoff_retry(*_, **__): + async def exponential_backoff_retry_fail_upload(fct: t.Callable[..., t.Any], *args, **kwargs): from aiida.common.exceptions import TransportTaskException - raise TransportTaskException + if 'do_upload' in fct.__name__: + raise TransportTaskException + else: + return await exponential_backoff_retry(fct, *args, **kwargs) + + @staticmethod + async def exponential_backoff_retry_fail_kill(fct: t.Callable[..., t.Any], *args, **kwargs): + from aiida.common.exceptions import TransportTaskException + + if 'do_kill' in fct.__name__: + raise TransportTaskException + else: + return await exponential_backoff_retry(fct, *args, **kwargs) @pytest.fixture(scope='function') @@ -213,11 +227,12 @@ def make_a_builder(sleep_seconds=0): @pytest.mark.requires_rmq @pytest.mark.usefixtures('started_daemon_client') -def test_process_kill_failng_ebm( +def test_process_kill_failing_ebm_upload( fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch ): - """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally. - # (e.g. in scenarios that transport is working again) + """Kill a process that is waiting after failed EBM during upload. It should be possible to kill it normally. + + A process that failed upload (e.g. 
in scenarios where the transport is working again) and is then killed with a regular kill command should be killed successfully.
+    """
     from aiida.orm import Int
 
@@ -232,7 +247,10 @@ def make_a_builder(sleep_seconds=0):
 
     kill_timeout = 10
 
-    monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', MockFunctions.mock_exponential_backoff_retry)
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_upload,
+    )
     with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
         node = submit_and_await(make_a_builder(), ProcessState.WAITING)
         await_condition(
@@ -241,10 +259,60 @@ def make_a_builder(sleep_seconds=0):
             timeout=kill_timeout,
         )
 
+        # kill should start the EBM and should successfully kill the process
         run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
         await_condition(lambda: node.is_killed, timeout=kill_timeout)
 
 
+@pytest.mark.requires_rmq
+@pytest.mark.usefixtures('started_daemon_client')
+def test_process_kill_failing_ebm_kill(
+    fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
+):
+    """Kill a process with a failing EBM during the kill.
+
+    Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems, it
+    retries using the EBM. If this fails, another kill command can be sent to restart the cancellation of the scheduler job.
+    """
+    from aiida.orm import Int
+
+    code = aiida_code_installed(default_calc_job_plugin='core.arithmetic.add', filepath_executable='/bin/bash')
+
+    def make_a_builder(sleep_seconds=0):
+        builder = code.get_builder()
+        builder.x = Int(1)
+        builder.y = Int(1)
+        builder.metadata.options.sleep = sleep_seconds
+        return builder
+
+    kill_timeout = 10
+
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_kill,
+    )
+    # from aiida.engine.utils import exponential_backoff_retry
+    # monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', exponential_backoff_retry)
+    with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
+        node = submit_and_await(make_a_builder(kill_timeout + 10), ProcessState.WAITING, timeout=kill_timeout)
+        await_condition(
+            lambda: node.process_status == 'Monitoring scheduler: job state RUNNING',
+            timeout=kill_timeout,
+        )
+
+        # kill should start the EBM and not succeed within the EBM
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # kill should restart the EBM and again not succeed within the EBM
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # force kill should skip the EBM and successfully kill the process
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '-F', '--wait'])
+        await_condition(lambda: node.is_killed, timeout=kill_timeout)
+
+
 class TestVerdiProcess:
     """Tests for `verdi process`."""

From 86a3410b15f064373af813bb839ee0e1c12472fa Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Thu, 1 May 2025 20:21:03 +0200
Subject: [PATCH 2/5] WIP: Profile data dumping (#6723)

Squashed commit at 2025-05-09 21:54

Add config pydantic model
Add detect.py
Add group-node-mapping
Add dump logger
Add dump engine
Add dump managers
Add facades
Add utils
Add changes to CLI
Add changes to init, disable mypy for feature for now
Add changes to docs
Add changes to and additional tests
[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Fix bug in
explicitly-groupd sub-workflows being filtered out for profile/group dumping Fix group validation exception on `verdi profile dump -G` and creation of empty dirs for deselected groups. --- .pre-commit-config.yaml | 1 + docs/source/howto/data.rst | 261 +++- docs/source/reference/command_line.rst | 4 +- src/aiida/cmdline/commands/cmd_group.py | 155 ++ src/aiida/cmdline/commands/cmd_process.py | 148 +- src/aiida/cmdline/commands/cmd_profile.py | 203 ++- src/aiida/cmdline/params/options/main.py | 163 ++- src/aiida/tools/dumping/__init__.py | 7 +- src/aiida/tools/dumping/config.py | 372 +++++ src/aiida/tools/dumping/detect.py | 838 +++++++++++ src/aiida/tools/dumping/engine.py | 276 ++++ src/aiida/tools/dumping/facades.py | 350 +++++ src/aiida/tools/dumping/logger.py | 536 +++++++ src/aiida/tools/dumping/managers/__init__.py | 0 src/aiida/tools/dumping/managers/deletion.py | 270 ++++ src/aiida/tools/dumping/managers/process.py | 768 ++++++++++ src/aiida/tools/dumping/managers/profile.py | 740 ++++++++++ src/aiida/tools/dumping/mapping.py | 212 +++ src/aiida/tools/dumping/processes.py | 426 ------ src/aiida/tools/dumping/utils.py | 75 - src/aiida/tools/dumping/utils/__init__.py | 0 src/aiida/tools/dumping/utils/helpers.py | 320 ++++ src/aiida/tools/dumping/utils/paths.py | 398 +++++ tests/cmdline/commands/test_process.py | 60 +- tests/conftest.py | 269 +++- tests/tools/dumping/__init__.py | 0 tests/tools/dumping/conftest.py | 30 + .../managers/test_process_dump_manager.py | 192 +++ tests/tools/dumping/test_facades.py | 1286 +++++++++++++++++ tests/tools/dumping/test_processes.py | 517 ------- tests/tools/dumping/utils.py | 174 +++ 31 files changed, 7926 insertions(+), 1125 deletions(-) create mode 100644 src/aiida/tools/dumping/config.py create mode 100644 src/aiida/tools/dumping/detect.py create mode 100644 src/aiida/tools/dumping/engine.py create mode 100644 src/aiida/tools/dumping/facades.py create mode 100644 src/aiida/tools/dumping/logger.py create mode 100644 src/aiida/tools/dumping/managers/__init__.py create mode 100644 src/aiida/tools/dumping/managers/deletion.py create mode 100644 src/aiida/tools/dumping/managers/process.py create mode 100644 src/aiida/tools/dumping/managers/profile.py create mode 100644 src/aiida/tools/dumping/mapping.py delete mode 100644 src/aiida/tools/dumping/processes.py delete mode 100644 src/aiida/tools/dumping/utils.py create mode 100644 src/aiida/tools/dumping/utils/__init__.py create mode 100644 src/aiida/tools/dumping/utils/helpers.py create mode 100644 src/aiida/tools/dumping/utils/paths.py create mode 100644 tests/tools/dumping/__init__.py create mode 100644 tests/tools/dumping/conftest.py create mode 100644 tests/tools/dumping/managers/test_process_dump_manager.py create mode 100644 tests/tools/dumping/test_facades.py delete mode 100644 tests/tools/dumping/test_processes.py create mode 100644 tests/tools/dumping/utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6252dd7513..a4c356aa4b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -191,6 +191,7 @@ repos: src/aiida/transports/plugins/local.py| src/aiida/transports/plugins/ssh.py| src/aiida/workflows/arithmetic/multiply_add.py| + src/aiida/tools/dumping/.*| )$ - id: generate-conda-environment diff --git a/docs/source/howto/data.rst b/docs/source/howto/data.rst index 19b8391db0..95010e40b5 100644 --- a/docs/source/howto/data.rst +++ b/docs/source/howto/data.rst @@ -81,7 +81,10 @@ For details refer to the next section :ref:`"How to add support for custom data 
.. _how-to:data:dump: Dumping data to disk --------------------- +==================== + +Profile dumping +--------------- .. versionadded:: 2.6 @@ -148,6 +151,262 @@ subdirectories. For a full list of available options, call :code:`verdi process dump --help`. +Group Dumping +------------- + +.. versionadded:: 2.7 + +The functionality has been expanded to also dump data from groups: + +.. code-block:: shell + + verdi group dump + +This command will create a directory structure with all processes contained in the specified group. For example: + +.. code-block:: shell + + $ verdi group dump my-calculations + Warning: This is a new feature which is still in its testing phase. If you encounter unexpected behavior or bugs, please reach out via Discourse. + Report: No config file found. Using command-line arguments. + Report: Starting dump process of group `my-calculations` in mode: INCREMENTAL + Report: Processing group changes... + Report: Processing 1 new or modified groups: ['my-calculations'] + Report: Dumping 1 nodes for group 'my-calculations' + Report: Saving final dump log, mapping, and configuration... + Success: Raw files for group `my-calculations` dumped into folder `group-my-calculations-dump`. + +Will result in the following output directory: + +.. code-block:: shell + + $ tree -a group-my-calculations-dump/ + group-my-calculations-dump + ├── .aiida_dump_safeguard + ├── aiida_dump_config.yaml + ├── aiida_dump_log.json + └── calculations + └── ArithmeticAddCalculation-4 + ├── .aiida_dump_safeguard + ├── .aiida_node_metadata.yaml + ├── inputs + │ ├── .aiida + │ │ ├── calcinfo.json + │ │ └── job_tmpl.json + │ ├── _aiidasubmit.sh + │ └── aiida.in + └── outputs + ├── _scheduler-stderr.txt + ├── _scheduler-stdout.txt + └── aiida.out + +Similarly for a group ``my-workflows`` with a ``MultiplyAddWorkChain``: + +.. code-block:: shell + + $ verdi group dump my-calculations + Warning: This is a new feature which is still in its testing phase. If you encounter unexpected behavior or bugs, please reach out via Discourse. + Report: No config file found. Using command-line arguments. + Report: Starting dump process of group `my-workflows` in mode: INCREMENTAL + Report: Processing group changes... + Report: Processing 1 new or modified groups: ['my-workflows'] + Report: Dumping 1 nodes for group 'my-workflows' + Report: Saving final dump log, mapping, and configuration... + Success: Raw files for group `my-workflows` dumped into folder `group-my-workflows-dump`. + +And the following output directory: + +.. code-block:: shell + + $ tree -a group-my-workflows-dump/ + group-my-workflows-dump + ├── .aiida_dump_safeguard + ├── aiida_dump_config.yaml + ├── aiida_dump_log.json + └── workflows + └── MultiplyAddWorkChain-11 + ├── .aiida_dump_safeguard + ├── .aiida_node_metadata.yaml + ├── 01-multiply-12 + │ ├── .aiida_dump_safeguard + │ ├── .aiida_node_metadata.yaml + │ └── inputs + │ └── source_file + └── 02-ArithmeticAddCalculation-14 + ├── .aiida_dump_safeguard + ├── .aiida_node_metadata.yaml + ├── inputs + │ ├── .aiida + │ │ ├── calcinfo.json + │ │ └── job_tmpl.json + │ ├── _aiidasubmit.sh + │ └── aiida.in + └── outputs + ├── _scheduler-stderr.txt + ├── _scheduler-stdout.txt + └── aiida.out + +Profile Dumping +--------------- + +.. versionadded:: 2.7 + +And, going even further, you can now also dump your data from an entire AiiDA profile. +If no options are provided, by default, no data is being dumped: + +.. 
code-block:: shell + + $ verdi profile dump + Warning: This is a new feature which is still in its testing phase. If you encounter unexpected behavior or bugs, please reach out via Discourse. + Report: No config file found. Using command-line arguments. + Warning: No specific data selection determined from config file or CLI arguments. + Warning: Please specify `--all` to dump all profile data or filters such as `groups`, `user` etc. + Warning: Use `--help` for all options and `--dry-run` to preview. + +This is to avoid accidentally initiating the dumping operation on a large AiiDA database. +Instead, if all data of the profile should be dumped, use the ``--all`` flag, or select a subset of your AiiDA data +using ``--groups``, ``--user``, as well as the various time-based filter options the command provides. + +If we run with ``--all`` on our current profile, we get the following result: + +.. code-block:: shell + + $ verdi profile dump --all + Warning: This is a new feature which is still in its testing phase. If you encounter unexpected behavior or bugs, please reach out via Discourse. + Report: No config file found. Using command-line arguments. + Report: Starting dump process of default profile in mode: INCREMENTAL + Report: Processing group changes... + Report: Processing 2 new or modified groups: ['my-calculations', 'my-workflows'] + Report: Dumping 1 nodes for group 'my-calculations' + Report: Dumping 1 nodes for group 'my-workflows' + Report: Saving final dump log, mapping, and configuration... + Success: Raw files for profile `docs` dumped into folder `profile-docs-dump`. + +The resulting directory preserves the group organization: + +.. code-block:: shell + + $ tree -a profile-docs-dump/ + profile-docs-dump + ├── .aiida_dump_safeguard + ├── aiida_dump_config.yaml + ├── aiida_dump_log.json + └── groups + ├── my-calculations + │ ├── .aiida_dump_safeguard + │ └── calculations + │ └── ArithmeticAddCalculation-4 + │ ├── .aiida_dump_safeguard + │ ├── .aiida_node_metadata.yaml + │ ├── inputs + │ │ ├── .aiida + │ │ │ ├── calcinfo.json + │ │ │ └── job_tmpl.json + │ │ ├── _aiidasubmit.sh + │ │ └── aiida.in + │ └── outputs + │ ├── _scheduler-stderr.txt + │ ├── _scheduler-stdout.txt + │ └── aiida.out + └── my-workflows + ├── .aiida_dump_safeguard + └── workflows + └── MultiplyAddWorkChain-11 + ├── .aiida_dump_safeguard + ├── .aiida_node_metadata.yaml + ├── 01-multiply-12 + │ ├── .aiida_dump_safeguard + │ ├── .aiida_node_metadata.yaml + │ └── inputs + │ └── source_file + └── 02-ArithmeticAddCalculation-14 + ├── .aiida_dump_safeguard + ├── .aiida_node_metadata.yaml + ├── inputs + │ ├── .aiida + │ │ ├── calcinfo.json + │ │ └── job_tmpl.json + │ ├── _aiidasubmit.sh + │ └── aiida.in + └── outputs + ├── _scheduler-stderr.txt + ├── _scheduler-stdout.txt + └── aiida.out + +.. Common Options +.. ------------ + +.. All three commands (``verdi process dump``, ``verdi group dump``, and ``verdi profile dump``) support various options: + +.. - ``-p/--path PATH``: Specify a custom dumping path +.. - ``-o/--overwrite``: Fully overwrite an existing dumping directory +.. - ``--include-inputs/--exclude-inputs``: Include/exclude linked input nodes +.. - ``--include-outputs/--exclude-outputs``: Include/exclude linked output nodes +.. - ``--include-attributes/--exclude-attributes``: Include/exclude node attributes +.. - ``--include-extras/--exclude-extras``: Include/exclude node extras +.. - ``-f/--flat``: Dump files in a flat directory structure +.. 
- ``--dump-unsealed/--no-dump-unsealed``: Allow/disallow dumping of unsealed process nodes
+
+.. For group and profile dumping, additional options include:
+
+.. - ``--filter-by-last-dump-time/--no-filter-by-last-dump-time``: Only dump nodes modified since last dump
+.. - ``--dump-processes/--no-dump-processes``: Control process dumping
+.. - ``--only-top-level-calcs/--no-only-top-level-calcs``: Control calculation directory creation
+.. - ``--only-top-level-workflows/--no-only-top-level-workflows``: Control workflow directory creation
+.. - ``--symlink-calcs/--no-symlink-calcs``: Use symlinks for duplicate calculations to avoid data duplication
+
+.. For a full list of available options, call ``verdi process dump --help``, ``verdi group dump --help``, or ``verdi profile dump --help``.
+
+.. Incremental Dumping
+.. -------------------
+
+.. By default, all dump commands operate in incremental mode, which means they only process nodes that are new or have been modified since the last dump operation. This makes the feature efficient when run repeatedly:
+
+.. .. code-block:: shell
+
+..     $ verdi group dump my-calculations
+..     Report: No (new) calculations to dump in group `my-calculations`.
+..     Report: No (new) workflows to dump in group `my-calculations`.
+..     Success: Raw files for group `my-calculations` dumped into folder `my-calculations-dump`.
+
+Python API
+----------
+
+The dump functionality is also available through a Python API:
+
+.. code-block:: python
+
+    # Dump a single process
+    from aiida import orm, load_profile
+    from aiida.tools.dumping import ProcessDumper
+
+    load_profile()
+    process_node = orm.load_node(4)  # ArithmeticAddCalculation node
+    process_dumper = ProcessDumper(process=process_node)
+    process_dumper.dump()
+
+    # Dump a group
+    from aiida.tools.dumping import GroupDumper
+    group = orm.load_group('my-calculations')
+    group_dumper = GroupDumper(group=group)
+    group_dumper.dump()
+
+    # Dump a profile
+    from aiida.tools.dumping import ProfileDumper
+    profile_dumper = ProfileDumper()
+    profile_dumper.dump()
+
+Usage Scenarios
+---------------
+
+The data dumping functionality was designed to bridge the gap between research conducted with AiiDA and scientists not familiar with AiiDA. Some common use cases include:
+
+1. Sharing simulation results with collaborators who don't use AiiDA
+2. Periodically running the dump command to reflect changes while working on a project
+3. Analyzing data using traditional shell tools outside of AiiDA's programmatic approach
+
 .. _how-to:data:import:provenance:
 
 Provenance
diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst
index 2b749fd3ae..c8dc5f88bc 100644
--- a/docs/source/reference/command_line.rst
+++ b/docs/source/reference/command_line.rst
@@ -223,6 +223,7 @@ Below is a list with all available subcommands.
       create        Create an empty group with a given label.
       delete        Delete groups and (optionally) the nodes they contain.
       description   Change the description of a group.
+      dump          Dump data of an AiiDA group to disk.
       list          Show a list of existing groups.
       move-nodes    Move the specified NODES from one group to another.
       path          Inspect groups of nodes, with delimited label paths.
@@ -397,6 +398,7 @@ Below is a list with all available subcommands.
     Commands:
       configure-rabbitmq  Configure RabbitMQ for a profile.
       delete              Delete one or more profiles.
+      dump                Dump all data in an AiiDA profile's storage to disk.
       list                Display a list of all available profiles.
       set-default         Set a profile as the default profile.
setdefault (Deprecated) Set a profile as the default profile. @@ -451,7 +453,7 @@ Below is a list with all available subcommands. --broker-host HOSTNAME Hostname for the message broker. [default: 127.0.0.1] --broker-port INTEGER Port for the message broker. [default: 5672] --broker-virtual-host TEXT Name of the virtual host for the message broker without - leading forward slash. [default: ""] + leading forward slash. --repository DIRECTORY Absolute path to the file repository. --test-profile Designate the profile to be used for running the test suite only. diff --git a/src/aiida/cmdline/commands/cmd_group.py b/src/aiida/cmdline/commands/cmd_group.py index 4f340b386a..c0e4764913 100644 --- a/src/aiida/cmdline/commands/cmd_group.py +++ b/src/aiida/cmdline/commands/cmd_group.py @@ -325,6 +325,11 @@ def group_relabel(group, label): echo.echo_critical(str(exception)) else: echo.echo_success(f"Label changed to '{label}'") + msg = ( + 'Note that if you are dumping your profile data to disk, to reflect the relabeling of the group, ' + 'run your `verdi profile dump` command again.' + ) + echo.echo_report(msg) @verdi_group.command('description') @@ -632,3 +637,153 @@ def group_path_ls(path, type_string, recursive, as_table, no_virtual, with_descr if no_virtual and child.is_virtual: continue echo.echo(child.path, bold=not child.is_virtual) + + +@verdi_group.command('dump') +@arguments.GROUP() +@options.PATH() +@options.DRY_RUN() +@options.OVERWRITE() +@options.PAST_DAYS() +@options.START_DATE() +@options.END_DATE() +@options.FILTER_BY_LAST_DUMP_TIME() +@options.DUMP_PROCESSES() +@options.DUMP_DATA() +@options.ONLY_TOP_LEVEL_CALCS() +@options.ONLY_TOP_LEVEL_WORKFLOWS() +@options.DELETE_MISSING() +@options.SYMLINK_CALCS() +@options.INCLUDE_INPUTS() +@options.INCLUDE_OUTPUTS() +@options.INCLUDE_ATTRIBUTES() +@options.INCLUDE_EXTRAS() +@options.FLAT() +@options.DUMP_UNSEALED() +@click.pass_context +@with_dbenv() +def group_dump( + ctx, + group, + path, + dry_run, + overwrite, + past_days, + start_date, + end_date, + filter_by_last_dump_time, + dump_processes, + dump_data, + delete_missing, + only_top_level_calcs, + only_top_level_workflows, + symlink_calcs, + include_inputs, + include_outputs, + include_attributes, + include_extras, + flat, + dump_unsealed, +): + """Dump data of an AiiDA group to disk. + + If 'aiida_dump_config.yaml' exists in the target directory, it will be used + as the sole source of configuration settings, ignoring other CLI flags. + Otherwise, CLI flags will be used. + """ + import traceback + + from pydantic import ValidationError + + from aiida.cmdline.utils import echo + from aiida.common import NotExistent + from aiida.tools.dumping import GroupDumper + from aiida.tools.dumping.config import DumpConfig, DumpMode + from aiida.tools.dumping.utils.paths import DumpPaths + + warning_msg = ( + 'This is a new feature which is still in its testing phase. ' + 'If you encounter unexpected behavior or bugs, please report them via Discourse or GitHub.' 
+ ) + echo.echo_warning(warning_msg) + + # --- Initial Setup --- + final_dump_config = None + try: + dump_paths = DumpPaths._resolve_click_path_for_dump(path=path, entity=group) + config_file_path = dump_paths.config_path + + if config_file_path.is_file(): + # --- Config File Exists: Load ONLY from file --- + try: + config_path_rel = config_file_path.relative_to(dump_paths.top_level.parent) + except ValueError: + config_path_rel = config_file_path + echo.echo_report(f"Config file found at '{config_path_rel}'.") + echo.echo_report('Using config file settings ONLY (ignoring other CLI flags).') + try: + final_dump_config = DumpConfig.parse_yaml_file(config_file_path) + if final_dump_config.groups not in ([group.uuid], [group.label]): + msg = ( + f"Config file specifies groups '{final_dump_config.groups}', " + f"Overriding to dump only group '{group.label}'." + ) + echo.echo_warning(msg) + final_dump_config.groups = [group.uuid] + + except (ValidationError, FileNotFoundError, ValueError) as e: + echo.echo_critical(f'Error loading or validating config file {config_file_path}: {e}') + return + else: + # --- Config File Does NOT Exist: Use ONLY CLI args --- + echo.echo_report('No config file found. Using command-line arguments.') + try: + # Gather relevant CLI args for group dump + config_input_data = { + 'dry_run': dry_run, + 'overwrite': overwrite, + 'groups': [group.uuid], # Set group explicitly here + 'past_days': past_days, + 'start_date': start_date, + 'end_date': end_date, + 'filter_by_last_dump_time': filter_by_last_dump_time, + 'dump_processes': dump_processes, + 'dump_data': dump_data, + 'only_top_level_calcs': only_top_level_calcs, + 'only_top_level_workflows': only_top_level_workflows, + 'delete_missing': delete_missing, + 'symlink_calcs': symlink_calcs, + 'include_inputs': include_inputs, + 'include_outputs': include_outputs, + 'include_attributes': include_attributes, + 'include_extras': include_extras, + 'flat': flat, + 'dump_unsealed': dump_unsealed, + } + final_dump_config = DumpConfig.model_validate(config_input_data) + except ValidationError as e: + echo.echo_critical(f'Invalid command-line arguments provided:\n{e}') + return + + # --- Logical Checks --- + if final_dump_config.dump_mode == DumpMode.DRY_RUN and overwrite: + msg = ( + '`--dry-run` and `--overwrite` selected (or set in config). Overwrite operation will NOT be performed.' + ) + echo.echo_warning(msg) + + # --- Instantiate and Run GroupDumper --- + group_dumper = GroupDumper(group=group, config=final_dump_config, output_path=dump_paths.top_level) + group_dumper.dump() + + if final_dump_config.dump_mode != DumpMode.DRY_RUN: + msg = f'Raw files for group `{group.label}` dumped into folder `{dump_paths.child}`.' 
+ echo.echo_success(msg) + else: + echo.echo_success('Dry run completed.') + + except NotExistent as e: + echo.echo_critical(f'Error: Required AiiDA entity not found: {e!s}') + except Exception as e: + msg = f'Unexpected error during dump of group {group.label}:\n ({e!s}).\n' + echo.echo_critical(msg + traceback.format_exc()) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index cf925f2507..681a28d412 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -14,6 +14,7 @@ from aiida.cmdline.params import arguments, options, types from aiida.cmdline.params.options.overridable import OverridableOption from aiida.cmdline.utils import decorators, echo +from aiida.cmdline.utils.decorators import with_dbenv from aiida.common.log import LOG_LEVELS, capture_logging REPAIR_INSTRUCTIONS = """\ @@ -583,50 +584,21 @@ def process_repair(manager, broker, dry_run): @verdi_process.command('dump') @arguments.PROCESS() @options.PATH() +@options.DRY_RUN() @options.OVERWRITE() -@click.option( - '--include-inputs/--exclude-inputs', - default=True, - show_default=True, - help='Include the linked input nodes of the `CalculationNode`(s).', -) -@click.option( - '--include-outputs/--exclude-outputs', - default=False, - show_default=True, - help='Include the linked output nodes of the `CalculationNode`(s).', -) -@click.option( - '--include-attributes/--exclude-attributes', - default=True, - show_default=True, - help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) -@click.option( - '--include-extras/--exclude-extras', - default=True, - show_default=True, - help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) -@click.option( - '-f', - '--flat', - is_flag=True, - default=False, - show_default=True, - help='Dump files in a flat directory for every step of the workflow.', -) -@click.option( - '--dump-unsealed', - is_flag=True, - default=False, - show_default=True, - help='Also allow the dumping of unsealed process nodes.', -) -@options.INCREMENTAL() +@options.INCLUDE_INPUTS() +@options.INCLUDE_OUTPUTS() +@options.INCLUDE_ATTRIBUTES() +@options.INCLUDE_EXTRAS() +@options.FLAT() +@options.DUMP_UNSEALED() +@click.pass_context +@with_dbenv() def process_dump( + ctx, process, path, + dry_run, overwrite, include_inputs, include_outputs, @@ -634,13 +606,12 @@ def process_dump( include_extras, flat, dump_unsealed, - incremental, ) -> None: """Dump process input and output files to disk. Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their - creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried by + creation time. The directory tree thus dumps the logical execution of the workflow, which can also be queried by running `verdi process status ` on the command line. By default, input and output files of each calculation can be found in the corresponding "inputs" and @@ -651,30 +622,83 @@ def process_dump( Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection. 
""" + import traceback + from pydantic import ValidationError + + from aiida.cmdline.utils import echo + from aiida.common import NotExistent from aiida.tools.archive.exceptions import ExportValidationError - from aiida.tools.dumping.processes import ProcessDumper - - process_dumper = ProcessDumper( - include_inputs=include_inputs, - include_outputs=include_outputs, - include_attributes=include_attributes, - include_extras=include_extras, - overwrite=overwrite, - flat=flat, - dump_unsealed=dump_unsealed, - incremental=incremental, + from aiida.tools.dumping import ProcessDumper + from aiida.tools.dumping.config import DumpConfig, DumpMode + from aiida.tools.dumping.utils.paths import DumpPaths + + warning_msg = ( + 'The backend implementation of this command was recently refactored. ' + 'If you encounter unexpected behavior or bugs, please reach out via Discourse.' ) + echo.echo_warning(warning_msg) + + # --- Initial Setup --- + final_dump_config = None try: - dump_path = process_dumper.dump(process_node=process, output_path=path) - except FileExistsError: - echo.echo_critical( - 'Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually.' - ) + dump_paths = DumpPaths._resolve_click_path_for_dump(path=path, entity=process) + config_file_path = dump_paths.config_path + + if config_file_path.is_file(): + # Config File Exists: Load ONLY from file + try: + config_path_rel = config_file_path.relative_to(dump_paths.top_level.parent) + except ValueError: + config_path_rel = config_file_path + echo.echo_report(f"Config file found at '{config_path_rel}'.") + echo.echo_report('Using config file settings ONLY (ignoring other CLI flags).') + try: + final_dump_config = DumpConfig.parse_yaml_file(config_file_path) + except (ValidationError, FileNotFoundError, ValueError) as e: + echo.echo_critical(f'Error loading or validating config file {config_file_path}: {e}') + return + else: + # Config File Does NOT Exist: Use ONLY CLI args + echo.echo_report('No config file found. Using command-line arguments.') + try: + # Gather relevant CLI args specific to process dump + config_input_data = { + 'dry_run': dry_run, + 'overwrite': overwrite, + 'include_inputs': include_inputs, + 'include_outputs': include_outputs, + 'include_attributes': include_attributes, + 'include_extras': include_extras, + 'flat': flat, + 'dump_unsealed': dump_unsealed, + } + final_dump_config = DumpConfig.model_validate(config_input_data) + except ValidationError as e: + echo.echo_critical(f'Invalid command-line arguments provided:\n{e}') + return + + # --- Logical Checks --- + # Check for dry_run + overwrite based on final config + if overwrite and dry_run: + msg = 'Config specifies both `dry_run` and `overwrite` as True. Overwrite will NOT be performed.' + echo.echo_warning(msg) + + # --- Instantiate and Run ProcessDumper --- + process_dumper = ProcessDumper(process=process, config=final_dump_config, output_path=dump_paths.top_level) + process_dumper.dump() + + if final_dump_config.dump_mode != DumpMode.DRY_RUN: + msg = f'Raw files for process `{process.pk}` dumped into folder `{dump_paths.child}`.' 
+ echo.echo_success(msg) + else: + echo.echo_success('Dry run completed.') + + except NotExistent as e: + echo.echo_critical(f'Error: Required AiiDA entity not found: {e!s}') except ExportValidationError as e: - echo.echo_critical(f'{e!s}') + echo.echo_critical(f'Data validation error during dump: {e!s}') except Exception as e: - echo.echo_critical(f'Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s}).') - - echo.echo_success(f'Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`.') + msg = f'Unexpected error during dump of process {process.pk}:\n ({e!s}).\n' + echo.echo_critical(msg + traceback.format_exc()) diff --git a/src/aiida/cmdline/commands/cmd_profile.py b/src/aiida/cmdline/commands/cmd_profile.py index a3f234f791..410d3dc2d9 100644 --- a/src/aiida/cmdline/commands/cmd_profile.py +++ b/src/aiida/cmdline/commands/cmd_profile.py @@ -17,6 +17,7 @@ from aiida.cmdline.params import arguments, options from aiida.cmdline.params.options.commands import setup from aiida.cmdline.utils import defaults, echo +from aiida.cmdline.utils.decorators import with_dbenv from aiida.common import exceptions from aiida.manage.configuration import Profile, create_profile, get_config @@ -58,7 +59,10 @@ def command_create_profile( from aiida.plugins.entry_point import get_entry_point_from_class if not storage_cls.read_only and email is None: - raise click.BadParameter('The option is required for storages that are not read-only.', param_hint='--email') + raise click.BadParameter( + 'The option is required for storages that are not read-only.', + param_hint='--email', + ) _, storage_entry_point = get_entry_point_from_class(storage_cls.__module__, storage_cls.__name__) assert storage_entry_point is not None @@ -94,7 +98,12 @@ def command_create_profile( broker_backend=broker_backend, broker_config=broker_config, ) - except (ValueError, TypeError, exceptions.EntryPointError, exceptions.StorageMigrationError) as exception: + except ( + ValueError, + TypeError, + exceptions.EntryPointError, + exceptions.StorageMigrationError, + ) as exception: echo.echo_critical(str(exception)) echo.echo_success(f'Created new profile `{profile.name}`.') @@ -164,7 +173,10 @@ def profile_list(): """Display a list of all available profiles.""" try: config = get_config() - except (exceptions.MissingConfigurationError, exceptions.ConfigurationError) as exception: + except ( + exceptions.MissingConfigurationError, + exceptions.ConfigurationError, + ) as exception: # This can happen for a fresh install and the `verdi setup` has not yet been run. In this case it is still nice # to be able to see the configuration directory, for instance for those who have set `AIIDA_PATH`. This way # they can at least verify that it is correctly set. 
@@ -224,7 +236,10 @@ def profile_set_default(profile): def _profile_set_default(profile): try: config = get_config() - except (exceptions.MissingConfigurationError, exceptions.ConfigurationError) as exception: + except ( + exceptions.MissingConfigurationError, + exceptions.ConfigurationError, + ) as exception: echo.echo_critical(str(exception)) config.set_default_profile(profile.name, overwrite=True).store() @@ -260,7 +275,10 @@ def profile_delete(force, delete_data, profiles): echo.echo_warning(f'Deleting profile `{profile.name}`, {suffix} all data.') if not force: - echo.echo_warning('This operation cannot be undone, are you sure you want to continue?', nl=False) + echo.echo_warning( + 'This operation cannot be undone, are you sure you want to continue?', + nl=False, + ) if not force and not click.confirm(''): echo.echo_report(f'Deleting of `{profile.name}` cancelled.') @@ -268,3 +286,178 @@ def profile_delete(force, delete_data, profiles): get_config().delete_profile(profile.name, delete_storage=delete_data) echo.echo_success(f'Profile `{profile.name}` was deleted.') + + +@verdi_profile.command('dump') +@options.PATH() +@options.DRY_RUN() +@options.OVERWRITE() +@options.ALL() +@options.CODES() +@options.COMPUTERS() +@options.GROUPS() +@options.USER() +@options.PAST_DAYS() +@options.START_DATE() +@options.END_DATE() +@options.FILTER_BY_LAST_DUMP_TIME() +@options.DUMP_PROCESSES() +@options.DUMP_DATA() +@options.ONLY_TOP_LEVEL_CALCS() +@options.ONLY_TOP_LEVEL_WORKFLOWS() +@options.DELETE_MISSING() +@options.SYMLINK_CALCS() +@options.ORGANIZE_BY_GROUPS() +@options.ALSO_UNGROUPED() +@options.UPDATE_GROUPS() +@options.INCLUDE_INPUTS() +@options.INCLUDE_OUTPUTS() +@options.INCLUDE_ATTRIBUTES() +@options.INCLUDE_EXTRAS() +@options.FLAT() +@options.DUMP_UNSEALED() +@click.pass_context +@with_dbenv() +def profile_dump( + ctx, + path, + dry_run, + overwrite, + all_entries, + codes, + computers, + groups, + user, + past_days, + start_date, + end_date, + filter_by_last_dump_time, + dump_processes, + dump_data, + only_top_level_calcs, + only_top_level_workflows, + delete_missing, + symlink_calcs, + organize_by_groups, + also_ungrouped, + update_groups, + include_inputs, + include_outputs, + include_attributes, + include_extras, + flat, + dump_unsealed, +): + """Dump all data in an AiiDA profile's storage to disk. + + If 'aiida_dump_config.yaml' exists in the target directory, it will be used + as the sole source of configuration settings, ignoring other CLI flags. + Otherwise, CLI flags will be used. + """ + import traceback + + from pydantic import ValidationError + + from aiida.cmdline.utils import echo + from aiida.common import NotExistent + from aiida.tools.dumping import ProfileDumper + from aiida.tools.dumping.config import DumpConfig, DumpMode, ProfileDumpSelection + from aiida.tools.dumping.utils.paths import DumpPaths + + warning_msg = ( + 'This is a new feature which is still in its testing phase. ' + 'If you encounter unexpected behavior or bugs, please report them via Discourse or GitHub.' 
+ ) + echo.echo_warning(warning_msg) + + # --- Initial Setup --- + profile = ctx.obj['profile'] + final_dump_config = None + try: + dump_paths = DumpPaths._resolve_click_path_for_dump(path=path, entity=profile) + config_file_path = dump_paths.config_path + + if config_file_path.is_file(): + # --- Config File Exists: Load ONLY from file --- + try: + config_path_rel = config_file_path.relative_to(dump_paths.top_level.parent) + except ValueError: + config_path_rel = config_file_path # Fallback if not relative + echo.echo_report(f"Config file found at '{config_path_rel}'.") + echo.echo_report('Using config file settings ONLY (ignoring other CLI flags).') + try: + # Validate and create config SOLELY from the file + final_dump_config = DumpConfig.parse_yaml_file(config_file_path) + except (ValidationError, FileNotFoundError, ValueError) as e: + echo.echo_critical(f'Error loading or validating config file {config_file_path}: {e}') + return + else: + # --- Config File Does NOT Exist: Use ONLY CLI args --- + echo.echo_report('No config file found. Using command-line arguments.') + try: + # Gather all relevant CLI args here + config_input_data = { + 'dry_run': dry_run, + 'overwrite': overwrite, + 'all_entries': all_entries, + 'groups': list(groups), + 'past_days': past_days, + 'start_date': start_date, + 'end_date': end_date, + 'user': user, + 'codes': codes, + 'computers': computers, + 'filter_by_last_dump_time': filter_by_last_dump_time, + 'dump_processes': dump_processes, + 'dump_data': dump_data, + 'only_top_level_calcs': only_top_level_calcs, + 'only_top_level_workflows': only_top_level_workflows, + 'delete_missing': delete_missing, + 'symlink_calcs': symlink_calcs, + 'organize_by_groups': organize_by_groups, + 'also_ungrouped': also_ungrouped, + 'update_groups': update_groups, + 'include_inputs': include_inputs, + 'include_outputs': include_outputs, + 'include_attributes': include_attributes, + 'include_extras': include_extras, + 'flat': flat, + 'dump_unsealed': dump_unsealed, + } + final_dump_config = DumpConfig.model_validate(config_input_data) + except ValidationError as e: + echo.echo_critical(f'Invalid command-line arguments provided:\n{e}') + return + + # --- Check final determined scope (applies to both paths) --- + if final_dump_config.profile_dump_selection == ProfileDumpSelection.NONE: + echo.echo_warning('No specific data selection determined from config file or CLI arguments.') + msg = 'Please specify `--all` to dump all profile data or filters such as `groups`, `user` etc.' + echo.echo_warning(msg) + echo.echo_warning('Use `--help` for all options and `--dry-run` to preview.') + return + + # --- Other logical checks --- + if not final_dump_config.organize_by_groups and final_dump_config.update_groups: + echo.echo_warning('`update_groups` is True, but `organize_by_groups` is False.') + if final_dump_config.dump_mode == DumpMode.DRY_RUN and overwrite: + msg = ( + '`--dry-run` and `--overwrite` selected (or set in config). Overwrite operation will NOT be performed.' + ) + echo.echo_warning(msg) + + # --- Instantiate and Run ProfileDumper --- + profile_dumper = ProfileDumper(config=final_dump_config, output_path=dump_paths.top_level) + profile_dumper.dump() + + if final_dump_config.dump_mode != DumpMode.DRY_RUN: + msg = f'Raw files for profile `{profile.name}` dumped into folder `{dump_paths.child}`.' 
+ echo.echo_success(msg) + else: + echo.echo_success('Dry run completed.') + + except NotExistent as e: + echo.echo_critical(f'Error: Required AiiDA entity not found: {e!s}') + except Exception as e: + msg = f'Unexpected error during dump of {profile.name}:\n ({e!s}).\n' + echo.echo_critical(msg + traceback.format_exc()) diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index c2ce719375..bd2aa0d73c 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -27,6 +27,7 @@ 'ALL', 'ALL_STATES', 'ALL_USERS', + 'ALSO_UNGROUPED', 'APPEND_TEXT', 'ARCHIVE_FORMAT', 'BROKER_HOST', @@ -53,13 +54,20 @@ 'DB_PORT', 'DB_USERNAME', 'DEBUG', + 'DELETE_MISSING', 'DESCRIPTION', 'DICT_FORMAT', 'DICT_KEYS', 'DRY_RUN', + 'DUMP_DATA', + 'DUMP_PROCESSES', + 'DUMP_UNSEALED', + 'END_DATE', 'EXIT_STATUS', 'EXPORT_FORMAT', 'FAILED', + 'FILTER_BY_LAST_DUMP_TIME', + 'FLAT', 'FORCE', 'FORMULA_MODE', 'FREQUENCY', @@ -68,7 +76,10 @@ 'GROUP_CLEAR', 'HOSTNAME', 'IDENTIFIER', - 'INCREMENTAL', + 'INCLUDE_ATTRIBUTES', + 'INCLUDE_EXTRAS', + 'INCLUDE_INPUTS', + 'INCLUDE_OUTPUTS', 'INPUT_FORMAT', 'INPUT_PLUGIN', 'LABEL', @@ -78,8 +89,11 @@ 'NODES', 'NON_INTERACTIVE', 'OLDER_THAN', + 'ONLY_TOP_LEVEL_CALCS', + 'ONLY_TOP_LEVEL_WORKFLOWS', 'ORDER_BY', 'ORDER_DIRECTION', + 'ORGANIZE_BY_GROUPS', 'OVERWRITE', 'PAST_DAYS', 'PATH', @@ -98,11 +112,14 @@ 'SCHEDULER', 'SILENT', 'SORT', + 'START_DATE', + 'SYMLINK_CALCS', 'TIMEOUT', 'TRAJECTORY_INDEX', 'TRANSPORT', 'TRAVERSAL_RULE_HELP_STRING', 'TYPE_STRING', + 'UPDATE_GROUPS', 'USER', 'USER_EMAIL', 'USER_FIRST_NAME', @@ -762,7 +779,7 @@ def set_log_level(ctx, _param, value): '--path', type=click.Path(path_type=pathlib.Path), show_default=False, - help='Base path for operations that write to disk.', + help='Base path for dump operations that write to disk.', ) OVERWRITE = OverridableOption( @@ -783,10 +800,146 @@ def set_log_level(ctx, _param, value): show_default=True, ) -INCREMENTAL = OverridableOption( - '--incremental/--no-incremental', +DUMP_DATA = OverridableOption( + '--dump-data/--no-dump-data', is_flag=True, + default=False, + show_default=True, + help='Dump data nodes.', +) + +DUMP_PROCESSES = OverridableOption( + '--dump-processes/--no-dump-processes', + is_flag=True, + default=True, + show_default=True, + help='Dump process data.', +) + +ORGANIZE_BY_GROUPS = OverridableOption( + '--organize-by-groups/--no-organize-by-groups', + default=True, + is_flag=True, + type=bool, + show_default=True, + help='If the collection of nodes to be dumped is organized in groups, reproduce its hierarchy.', +) + +INCLUDE_INPUTS = OverridableOption( + '--include-inputs/--exclude-inputs', + default=True, + show_default=True, + help='Include linked input nodes of `CalculationNode`(s).', +) + +INCLUDE_OUTPUTS = OverridableOption( + '--include-outputs/--exclude-outputs', + default=False, + show_default=True, + help='Include linked output nodes of `CalculationNode`(s).', +) + +INCLUDE_ATTRIBUTES = OverridableOption( + '--include-attributes/--exclude-attributes', + default=True, + show_default=True, + help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +INCLUDE_EXTRAS = OverridableOption( + '--include-extras/--exclude-extras', + default=True, + show_default=True, + help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) + +FLAT = OverridableOption( + '-f', + '--flat', + is_flag=True, + default=False, + help='Dump files in a flat 
directory for every step of a workflow.', +) + +SYMLINK_CALCS = OverridableOption( + '--symlink-calcs/--no-symlink-calcs', + default=False, + show_default=True, + help='Symlink workflow sub-calculations to their own dedicated directories.', + # (must be used in conjunction with no-only-top-level-calcs) +) + +DELETE_MISSING = OverridableOption( + '--delete-missing/--no-delete-missing', default=True, show_default=True, - help="Incremental dumping of data to disk. Doesn't require using overwrite to clean previous directories.", + help='If a previously dumped group or node is deleted from the DB, delete the corresponding dump directory.', +) + +ALSO_UNGROUPED = OverridableOption( + '--also-ungrouped/--no-also-ungrouped', + default=False, + show_default=True, + help='Dump also data of nodes that are not part of any group.', +) + +ONLY_TOP_LEVEL_CALCS = OverridableOption( + '--only-top-level-calcs/--no-only-top-level-calcs', + default=True, + show_default=True, + help='Dump calculations in their own dedicated directories, not just as part of the dumped workflow.', +) + +ONLY_TOP_LEVEL_WORKFLOWS = OverridableOption( + '--only-top-level-workflows/--no-only-top-level-workflows', + default=True, + show_default=True, + help='If a top-level workflow calls sub-workflows, create a designated directory only for the top-level workflow.', +) + +UPDATE_GROUPS = OverridableOption( + '--update-groups/--no-update-groups', + default=True, + show_default=True, + help='Update directories if nodes have been added to other groups, or organized differently in terms of groups.', +) + +DUMP_UNSEALED = OverridableOption( + '--dump-unsealed/--no-dump-unsealed', + is_flag=True, + default=False, + show_default=True, + help='Also allow the dumping of unsealed process nodes.', +) + +FILTER_BY_LAST_DUMP_TIME = OverridableOption( + '--filter-by-last-dump-time/--no-filter-by-last-dump-time', + is_flag=True, + default=True, + show_default=True, + help='Only select nodes whose `mtime` is after the last dump time.', +) + +FILTER_BY_LAST_DUMP_TIME = OverridableOption( + '--filter-by-last-dump-time/--no-filter-by-last-dump-time', + is_flag=True, + default=True, + show_default=True, + help='Only select nodes whose `mtime` is after the last dump time.', +) + +START_DATE = OverridableOption( + '--start-date', + is_flag=False, + default=None, + show_default=True, + help='Start date for node mtime range selection for profile dumping.', +) + +END_DATE = OverridableOption( + '--end-date', + is_flag=False, + default=None, + show_default=True, + help='End date for node mtime range selection for profile dumping.', ) diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index a746fa171e..9244adcc08 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -6,6 +6,9 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -"""Modules related to the dumping of AiiDA data.""" +"""Public API for data dumping functionality.""" -__all__ = ('processes',) +from aiida.tools.dumping.config import DumpConfig, DumpMode +from aiida.tools.dumping.facades import GroupDumper, ProcessDumper, ProfileDumper + +__all__ = ('DumpConfig', 'DumpMode', 'GroupDumper', 'ProcessDumper', 'ProfileDumper') diff --git a/src/aiida/tools/dumping/config.py b/src/aiida/tools/dumping/config.py new file mode 100644 index 0000000000..3c4393fe86 --- /dev/null +++ 
b/src/aiida/tools/dumping/config.py @@ -0,0 +1,372 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +from __future__ import annotations + +from datetime import datetime +from enum import Enum, auto +from pathlib import Path +from typing import Annotated, Any, Dict, List, Optional, Type, Union + +import yaml +from pydantic import ( + BaseModel, + BeforeValidator, + ConfigDict, + Field, + field_validator, + model_serializer, + model_validator, +) + +from aiida import orm +from aiida.common.exceptions import NotExistent +from aiida.common.log import AIIDA_LOGGER +from aiida.orm import Code, Computer, User + + +class DumpMode(Enum): + INCREMENTAL = auto() + OVERWRITE = auto() + DRY_RUN = auto() + + +# TODO: See if this can be removed +class ProfileDumpSelection(Enum): + NONE = auto() + ALL = auto() + SPECIFIC = auto() + + +class GroupDumpScope(Enum): + IN_GROUP = auto() + ANY = auto() + NO_GROUP = auto() + + +logger = AIIDA_LOGGER.getChild('tools.dumping.config') + + +def _load_computer_validator(v: Any) -> orm.Computer | None: + """Pydantic validator function to load Computer from identifier.""" + if v is None or isinstance(v, orm.Computer): + return v + if isinstance(v, str): + try: + return orm.load_computer(identifier=v) + except NotExistent: + logger.warning(f"Computer with identifier '{v}' not found in DB. Returning None for this item.") + return None + except Exception as e: + logger.error(f"Error loading Computer '{v}': {e}. Returning None for this item.") + return None + logger.warning(f'Invalid input type for computer validation: {type(v)}. Returning None.') + return None + + +def _load_code_validator(v: Any) -> orm.Code | None: + """Pydantic validator function to load Code from identifier.""" + if v is None or isinstance(v, orm.Code): + return v + if isinstance(v, str): + try: + node = orm.load_node(identifier=v) + if isinstance(node, orm.Code): + return node + else: + logger.warning(f"Node identifier '{v}' does not correspond to a Code. Returning None for this item.") + return None + except NotExistent: + logger.warning(f"Code with identifier '{v}' not found in DB. Returning None for this item.") + return None + except Exception as e: + logger.error(f"Error loading Code '{v}': {e}. Returning None for this item.") + return None + logger.warning(f'Invalid input type for code validation: {type(v)}. Returning None.') + return None + + +# Define Annotated types to apply the validators to list items +ComputerOrNone = Annotated[Optional[orm.Computer], BeforeValidator(_load_computer_validator)] +CodeOrNone = Annotated[Optional[orm.Code], BeforeValidator(_load_code_validator)] + + +class DumpConfig(BaseModel): + """ + Unified Pydantic configuration for dump operations. + Handles serialization/deserialization to/from Click-option-like keys. 
+ """ + + # --- Model Configuration --- + model_config = ConfigDict( + arbitrary_types_allowed=True, + validate_assignment=True, + ) + + groups: Optional[List[Union[str, orm.Group]]] = Field(default=None, description='Groups to dump (UUIDs or labels)') + start_date: Optional[datetime] = Field(default=None, description='Start date/time for modification time filter') + end_date: Optional[datetime] = Field(default=None, description='End date/time for modification time filter') + past_days: Optional[int] = Field(default=None, description='Number of past days to include based on mtime.') + + user: Optional[Union[User, str]] = Field(default=None, description='User object or email to filter by') + + computers: Optional[List[Union[Computer, str]]] = Field( + default=None, description='List of Computer objects or UUIDs/labels to filter by' + ) + + codes: Optional[List[Union[Code, str]]] = Field( + default=None, description='List of Code objects or UUIDs/labels to filter by' + ) + + # --- Global options --- + dump_mode: DumpMode = DumpMode.INCREMENTAL + + # --- Node collection options --- + dump_processes: bool = True + dump_data: bool = False + filter_by_last_dump_time: bool = True + only_top_level_calcs: bool = True + only_top_level_workflows: bool = True + group_scope: GroupDumpScope = Field( + default=GroupDumpScope.IN_GROUP, + exclude=True, # Exclude from standard serialization, internal class + ) + + # --- Process dump options --- + include_inputs: bool = True + include_outputs: bool = False + include_attributes: bool = True + include_extras: bool = False + flat: bool = False + dump_unsealed: bool = False + symlink_calcs: bool = False + + # --- Group/Profile options --- + delete_missing: bool = True + organize_by_groups: bool = True + also_ungrouped: bool = False + update_groups: bool = True + profile_dump_selection: ProfileDumpSelection = Field( + default=ProfileDumpSelection.NONE, + exclude=True, # Exclude from standard serialization, internal class + ) + + # --- Pydantic Field Validators --- + @field_validator('groups', mode='before') + @classmethod + def _validate_groups_input(cls, v: Any) -> Optional[List[str]]: + """ + Validate and transform the input for the 'groups' field. + Accepts a list containing orm.Group objects or strings (labels/UUIDs), + and converts all elements to strings (using group label). + """ + if v is None: + return None + if not isinstance(v, list): + # According to the error, a list is expected. + # If other types are possible at this stage from Click, adjust as needed. + raise ValueError(f'Invalid input type for groups: {type(v)}. Expected a list.') + + processed_groups: List[str] = [] + for item_idx, item in enumerate(v): + if isinstance(item, orm.Group): + # Using group's label as the string representation. + # Change to item.uuid if UUIDs are preferred. + processed_groups.append(item.label) + elif isinstance(item, str): + processed_groups.append(item) + else: + msg = ( + f"Invalid item type in 'groups' list at index {item_idx}: {type(item)}. " + 'Expected an AiiDA Group object or a string (label/UUID).' + ) + raise ValueError(msg) + # Return None if list is empty to match Field(default=None) behavior if desired, + # or always return the list (Pydantic will handle default if input is None). 
+ return processed_groups if processed_groups else None + + @field_validator('user', mode='before') + def _validate_user(cls, v: Any) -> User | None: # noqa: N805 + """Load User object from email string.""" + if v is None or isinstance(v, orm.User): + return v + if isinstance(v, str): + try: + return orm.User.collection.get(email=v) + except NotExistent: + logger.warning(f"User with email '{v}' not found in DB. Returning None.") + return None + except Exception as e: + logger.error(f"Error loading User '{v}': {e}. Returning None.") + return None + # Raise error for completely invalid input types during validation + msg = f'Invalid input type for user: {type(v)}. Expected email string or User object.' + raise ValueError(msg) + + @model_validator(mode='after') + def _check_date_filters(self) -> 'DumpConfig': + """Ensure past_days is not used with start_date or end_date.""" + if self.past_days is not None and (self.start_date is not None or self.end_date is not None): + msg = 'Cannot use `past_days` filter together with `start_date` or `end_date`.' + raise ValueError(msg) + return self + + @model_validator(mode='before') + @classmethod + def _map_click_options_to_internal(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """Map incoming Click-option-like keys to internal representation.""" + # Handle Dump Mode + if values.pop('dry_run', False): + values['dump_mode'] = DumpMode.DRY_RUN + elif values.pop('overwrite', False): + values['dump_mode'] = DumpMode.OVERWRITE + + # Handle Filters: map keys and determine if specific filters were set + # No need to map 'groups' anymore as field name matches + filter_map_from_option = { + # "groups": "groups", # No longer needed + 'user': 'user', + 'codes': 'codes', + 'computers': 'computers', + 'start_date': 'start_date', + 'end_date': 'end_date', + } + has_specific_filters = False + # Check original Click option names + 'groups' which now matches + click_filter_keys = list(filter_map_from_option.keys()) + ['groups'] + for key in click_filter_keys: + if key in values and values[key] is not None: + has_specific_filters = True + # Move value if key names differ (only for start/end_date now) + target_field = filter_map_from_option.get(key, key) + if key != target_field: + values[target_field] = values.pop(key) + + # Determine Profile Scope + all_entries_set = values.get('all_entries', False) # Default to False if key missing + + if all_entries_set: + values['profile_dump_selection'] = ProfileDumpSelection.ALL + else: + # Check for specific filters using internal field names + specific_filter_fields = {'groups', 'user', 'codes', 'computers', 'start_date', 'end_date', 'past_days'} + has_specific_filters = any( + field in values and values[field] is not None and values[field] != [] + for field in specific_filter_fields + ) + + if has_specific_filters: + values['profile_dump_selection'] = ProfileDumpSelection.SPECIFIC + # Default to NONE only if not set otherwise + elif 'profile_dump_selection' not in values: + values['profile_dump_selection'] = ProfileDumpSelection.NONE + return values + + # --- Serialization override --- + @model_serializer(mode='wrap', when_used='json') + def _serialize_to_click_options(self, handler): + """Serialize to dict with Click-option-like keys.""" + # Exclude internal fields before handler runs if possible, + # otherwise remove them after. `exclude=True` on Field helps here. + data = handler(self) + + # Add Click keys based on internal fields + # 1. 
Dump Mode -> flags + # Access internal fields via self + data['dry_run'] = self.dump_mode == DumpMode.DRY_RUN + data['overwrite'] = self.dump_mode == DumpMode.OVERWRITE + data.pop('dump_mode', None) + + # 2. Profile Scope -> all_entries flag + data['all_entries'] = self.profile_dump_selection == ProfileDumpSelection.ALL + + # 3. Filter fields -> option names & identifier serialization + # No need to map 'groups' name + filter_map_to_option = { + 'groups': 'groups', # Keep mapping for iteration logic + 'user': 'user', + 'codes': 'codes', + 'computers': 'computers', + 'start_date': 'start_date', + 'end_date': 'end_date', + } + orm_fields_map = {'user': 'email', 'computers': 'uuid', 'codes': 'uuid'} + + # Iterate through *potential* output keys, get value from self + for field_name, option_name in filter_map_to_option.items(): + value = getattr(self, field_name, None) + if value is not None: + # Serialize value if needed (dates, ORM objects) + if isinstance(value, datetime): + data[option_name] = value.isoformat() + elif field_name in orm_fields_map: + orm_attr = orm_fields_map[field_name] + if isinstance(value, list): # Handle list of ORM objects + serialized_list = [getattr(item, orm_attr, None) for item in value if hasattr(item, orm_attr)] + # Only include if list is not empty after serialization + if serialized_list: + data[option_name] = [item for item in serialized_list if item is not None] + elif hasattr(value, orm_attr): # Handle single ORM object + data[option_name] = getattr(value, orm_attr) + elif isinstance(value, list) and value: # Handle non-empty basic lists (like groups) + data[option_name] = value + elif not isinstance(value, list): # Handle other non-list, non-None values + data[option_name] = value + + # Clean up empty lists potentially created if ORM serialization failed + if option_name in data and isinstance(data[option_name], list) and not data[option_name]: + data.pop(option_name) + + # Remove None values explicitly if they remain + data = {k: v for k, v in data.items() if v is not None} + + return data + + # --- File Handling Methods --- + @classmethod + def parse_yaml_file(cls: Type['DumpConfig'], path: str | Path) -> 'DumpConfig': + """Loads configuration from a YAML file using Pydantic validation.""" + # (Implementation remains the same) + file_path = Path(path) + if not file_path.is_file(): + logger.error(f'Configuration file not found: {file_path}') + raise FileNotFoundError(f'Configuration file not found: {file_path}') + logger.info(f'Loading configuration from YAML: {file_path}') + try: + with file_path.open('r', encoding='utf-8') as f: + config_data = yaml.safe_load(f) or {} + instance = cls.model_validate(config_data) + logger.info('Successfully validated configuration from file.') + return instance + except Exception as e: + logger.error( + f'Failed to load or validate config file {file_path}: {e}', + exc_info=True, + ) + raise ValueError(f'Failed to load/validate configuration from {file_path}') from e + + def _save_yaml_file(self, path: str | Path) -> None: + """Saves the configuration to a YAML file.""" + # (Implementation remains the same) + file_path = Path(path) + logger.info(f'Saving configuration to YAML: {file_path}') + try: + file_path.parent.mkdir(parents=True, exist_ok=True) + config_dict = self.model_dump(mode='json') # Triggers serializer + with file_path.open('w', encoding='utf-8') as f: + yaml.dump(config_dict, f, indent=4, default_flow_style=False) + logger.info(f'Configuration saved successfully to {file_path}') + except Exception as e: + 
logger.error(f'Failed to save configuration to {file_path}: {e}', exc_info=True) + raise IOError(f'Failed to save configuration to {file_path}') from e + + +# --- IMPORTANT: Finalize Pydantic Model --- +# Call model_rebuild() after the class definition to resolve forward references +# and build the final schema needed for validation/serialization. +DumpConfig.model_rebuild(force=True) diff --git a/src/aiida/tools/dumping/detect.py b/src/aiida/tools/dumping/detect.py new file mode 100644 index 0000000000..30d0f5e3d4 --- /dev/null +++ b/src/aiida/tools/dumping/detect.py @@ -0,0 +1,838 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Class to collect nodes for dump feature.""" + +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type, cast + +from aiida import orm +from aiida.common.exceptions import NotExistent +from aiida.common.log import AIIDA_LOGGER +from aiida.orm import QueryBuilder +from aiida.tools.dumping.config import GroupDumpScope +from aiida.tools.dumping.mapping import GroupNodeMapping +from aiida.tools.dumping.utils.helpers import ( + DumpNodeStore, + DumpStoreKeys, + DumpTimes, + GroupChanges, + GroupInfo, + GroupRenameInfo, + NodeChanges, + NodeMembershipChange, +) +from aiida.tools.dumping.utils.paths import DumpPaths + +if TYPE_CHECKING: + from aiida.orm import Group, Node, QueryBuilder + from aiida.tools.dumping.config import DumpConfig + from aiida.tools.dumping.logger import DumpLogger + +__all__ = ('DumpChangeDetector', 'DumpNodeQuery') + +logger = AIIDA_LOGGER.getChild('tools.dumping.detect') + + +class DumpChangeDetector: + """Detects changes in the database since the last dump""" + + def __init__(self, dump_logger: DumpLogger, config: DumpConfig, dump_times: DumpTimes) -> None: + """ + Initializes the DumpChangeDetector. + + Args: + dump_logger: The logger instance holding data from the previous dump. + config: The current dump configuration. + dump_times: Object holding relevant timestamps for the current dump. 
+ """ + self.dump_logger: DumpLogger = dump_logger + self.config: DumpConfig = config + self.dump_times: DumpTimes = dump_times + # Instantiate the new query handler + self.node_query = DumpNodeQuery(config) + # Cache grouped node UUIDs to avoid rebuilding mapping multiple times per run + self._grouped_node_uuids_cache: set[str] | None = None + + def _get_all_grouped_node_uuids(self) -> set[str]: + """Gets and caches the set of UUIDs for all nodes in any group.""" + if self._grouped_node_uuids_cache is None: + logger.debug('Building and caching grouped node UUID set...') + try: + mapping = GroupNodeMapping.build_from_db() + # Union of all sets of node UUIDs from the group_to_nodes mapping + self._grouped_node_uuids_cache = set().union(*mapping.group_to_nodes.values()) + logger.debug(f'Cached {len(self._grouped_node_uuids_cache)} grouped node UUIDs.') + except Exception as e: + logger.error(f'Failed to build group mapping for caching: {e}', exc_info=True) + # Cache empty set on error to avoid repeated attempts within the same run + self._grouped_node_uuids_cache = set() + return self._grouped_node_uuids_cache + + def _query_initial_candidates(self, scope: GroupDumpScope, group: Optional[Group] = None) -> dict[str, list[Node]]: + """Query broad candidate nodes using the unified DumpNodeQuery.""" + raw_nodes: dict[str, list[Node]] = { + 'workflows': [], + 'calculations': [], + 'data': [], + } + nodes_to_query: list[tuple[type[Node], str]] = [] + + # Determine which node types to query based on config + if self.config.dump_processes: + nodes_to_query.extend([(orm.WorkflowNode, 'workflows'), (orm.CalculationNode, 'calculations')]) + if self.config.dump_data: + logger.warning('Data node detection not implemented.') + + # Resolve base time filters ONCE before looping + base_filters = self.node_query._resolve_time_filters( + orm.Node, # Use base Node type for generic time filter resolving + dump_times=self.dump_times, + include_time_filter=self.config.filter_by_last_dump_time, + ) + + for orm_type, store_key in nodes_to_query: + logger.debug(f'Querying candidate nodes of type {orm_type.__name__} with scope {scope.name}...') + try: + # Use the unified querier + nodes = self.node_query._get_nodes( + orm_type=orm_type, + dump_times=self.dump_times, + scope=scope, + group=group, + base_filters=base_filters, # Pass pre-resolved base filters + ) + logger.debug( + f'Query returned {len(nodes)} candidate nodes for {store_key} (scope: {scope.name}, pre-filtering).' + ) + raw_nodes[store_key] = nodes + except Exception as e: + logger.warning(f'Failed to query candidate nodes for {store_key} (scope: {scope.name}): {e}') + return raw_nodes + + def _apply_logged_status_filter(self, raw_nodes: dict[str, list[Node]]) -> dict[str, list[Node]]: + """Filter out nodes that are already present in the dump log. + + :param raw_nodes: _description_ + :return: _description_ + """ + + logger.debug('Applying logged status filter...') + logged_filtered_nodes: dict[str, list[Node]] = { + 'workflows': [], + 'calculations': [], + 'data': [], + } + nodes_removed_by_log_filter = 0 + for store_key, nodes in raw_nodes.items(): + if not nodes: + continue + try: + # Get the appropriate log store (calculations, workflows, etc.) 
+ log_store = self.dump_logger.get_store_by_name(store_key) # type: ignore[arg-type] + logged_uuids = set(log_store.entries.keys()) + + if not logged_uuids: # If log store is empty, keep all nodes + filtered_list = nodes + else: + # Keep nodes whose UUIDs are NOT in the logged set + filtered_list = [node for node in nodes if node.uuid not in logged_uuids] + + logged_filtered_nodes[store_key] = filtered_list + removed_count = len(nodes) - len(filtered_list) + if removed_count > 0: + logger.debug(f'Removed {removed_count} already logged nodes from {store_key}.') + nodes_removed_by_log_filter += removed_count + except ValueError as e: # Catch potential errors from get_store_by_name + logger.error(f"Error getting log store for key '{store_key}': {e}") + logged_filtered_nodes[store_key] = nodes # Keep original nodes on error + except Exception as e: + logger.error( + f"Unexpected error applying log filter for '{store_key}': {e}", + exc_info=True, + ) + logged_filtered_nodes[store_key] = nodes # Keep original nodes on error + + logger.debug(f'Removed {nodes_removed_by_log_filter} total nodes already present in log.') + return logged_filtered_nodes + + def _apply_top_level_filter(self, logged_filtered_nodes: dict[str, list[Node]]) -> dict[str, list[Node]]: + """Apply the top-level calculation/workflow filter. + + If calculations or workflows are explicitly part of a group, they are kept, + even if they are sub-calculations of a workflow. + + :param logged_filtered_nodes: Dictionary containing lists of 'workflows' and 'calculations'. + :type logged_filtered_nodes: dict[str, list[Node]] + :return: Dictionary with nodes filtered based on top-level status or grouping. + :rtype: dict[str, list[Node]] + """ + logger.debug('Applying top-level status filter...') + final_filtered_nodes: dict[str, list[Node]] = { + 'workflows': [], + 'calculations': [], + } + nodes_removed_by_top_level_filter = 0 + + # Get grouped node UUIDs (use cache) only if needed + # Check if any filtering is actually enabled before fetching UUIDs + needs_group_check = self.config.only_top_level_workflows or self.config.only_top_level_calcs + all_grouped_node_uuids: set[str] = set() + if needs_group_check: + all_grouped_node_uuids = self._get_all_grouped_node_uuids() + + # --- Define the inner function --- + def _filter_nodes_by_caller( + node_list: list[Node], node_type: str, grouped_uuids: set[str] + ) -> tuple[list[Node], int]: + """Filter nodes keeping only top-level ones or those explicitly grouped. + + :param node_list: List of nodes to filter. + :param node_type: Type of node ("workflows" or "calculations") for logging. + :param grouped_uuids: Set of UUIDs for nodes that are explicitly grouped. + :return: A tuple containing the filtered list of nodes and the count of removed nodes. + :rtype: tuple[list[TNode], int] + """ + filtered_nodes: list[Node] = [] + original_count = len(node_list) + + for node in node_list: + try: + # Check if node has a caller (i.e., is sub-node) + # Use getattr with default to avoid exception if 'caller' doesn't exist + is_sub_node = bool(getattr(node, 'caller', None)) + except Exception: + # Log specific exceptions if possible, but catch broad for robustness + msg = ( + f'Could not check caller for {node_type[:-1]} {getattr(node, "pk", "N/A")}.' + 'Assuming top-level. 
Error: {e}' + ) + logger.warning(msg) + is_sub_node = False # Assume top-level if check fails + + is_explicitly_grouped = node.uuid in grouped_uuids + + # Keep if: not a sub-node OR is explicitly grouped + if not is_sub_node or is_explicitly_grouped: + filtered_nodes.append(node) + + removed_count = original_count - len(filtered_nodes) + if removed_count > 0: + logger.debug(f'Removed {removed_count} non-top-level, non-grouped {node_type}.') + + # Return both the filtered list and the count of removed items + return filtered_nodes, removed_count + + # --- Filter Workflows --- + wf_list = logged_filtered_nodes.get('workflows', []) + if self.config.only_top_level_workflows and wf_list: + # Call the modified inner function and unpack the results + filtered_wfs, removed_wfs = _filter_nodes_by_caller(wf_list, 'workflows', all_grouped_node_uuids) + final_filtered_nodes['workflows'] = filtered_wfs + # Accumulate the count + nodes_removed_by_top_level_filter += removed_wfs + else: + # If no filtering applied, assign the original list + final_filtered_nodes['workflows'] = wf_list + + # --- Filter Calculations --- + calc_list = logged_filtered_nodes.get('calculations', []) + if self.config.only_top_level_calcs and calc_list: + # Call the modified inner function and unpack the results + filtered_calcs, removed_calcs = _filter_nodes_by_caller(calc_list, 'calculations', all_grouped_node_uuids) + final_filtered_nodes['calculations'] = filtered_calcs + # Accumulate the count + nodes_removed_by_top_level_filter += removed_calcs + else: + # If no filtering applied, assign the original list + final_filtered_nodes['calculations'] = calc_list + + logger.debug(f'Removed {nodes_removed_by_top_level_filter} total nodes by top-level filter.') + return final_filtered_nodes + + def _detect_new_nodes(self, scope: GroupDumpScope, group: Optional[Group] = None) -> DumpNodeStore: + """Detect new/modified nodes for a given scope, applying post-query filters. + + :param scope: _description_ + :param group: _description_, defaults to None + :return: _description_ + """ + logger.debug(f'Detecting new/modified nodes with scope {scope.name}...') + final_node_store = DumpNodeStore() + + # 1. Query initial candidates using the unified querier and scope + raw_nodes = self._query_initial_candidates(scope, group) + + # 2. Apply logged status filter + logged_filtered_nodes = self._apply_logged_status_filter(raw_nodes) + + # 3. Apply top-level filter (with exception for grouped calcs) + final_filtered_nodes = self._apply_top_level_filter(logged_filtered_nodes) + + # 4. Populate final Node Store + final_node_store.workflows = final_filtered_nodes.get('workflows', []) + final_node_store.calculations = final_filtered_nodes.get('calculations', []) + final_node_store.data = final_filtered_nodes.get('data', []) + + wf_count = len(final_node_store.workflows) + calc_count = len(final_node_store.calculations) + data_count = len(final_node_store.data) + logger.debug( + f'Finished detecting new/modified nodes (scope {scope.name}). Final counts: ' + f'Workflows={wf_count}, Calculations={calc_count}, Data={data_count}.' 
+ ) + return final_node_store + + def _detect_deleted_nodes(self) -> set[str]: + """Detect nodes deleted from DB since last dump.""" + logger.debug('Detecting deleted nodes...') + deleted_node_uuids: set[str] = set() + + # Iterate through the ORM types we might have logged + for orm_type in ( + orm.CalculationNode, + orm.WorkflowNode, + orm.Data, + ): + store_name = DumpStoreKeys.from_class(orm_class=orm_type) + try: + dump_store = self.dump_logger.get_store_by_name(name=store_name) + if not dump_store: + # Store might not exist if no nodes of this type were ever logged + continue + + dumped_uuids = set(dump_store.entries.keys()) + if not dumped_uuids: # Skip if no nodes of this type were logged + continue + + # Query for existing UUIDs of this type in the database + qb = QueryBuilder() + qb.append(orm_type, project=['uuid']) + all_db_uuids_for_type = set(qb.all(flat=True)) + + # Find UUIDs that were logged but are no longer in the DB + missing_uuids = dumped_uuids - all_db_uuids_for_type + if missing_uuids: + logger.debug(f'Detected {len(missing_uuids)} deleted nodes of type {orm_type.__name__}') + deleted_node_uuids.update(missing_uuids) + except ValueError as e: # Catch potential errors from get_store_by_name + logger.error(f"Error accessing log store for type '{orm_type.__name__}': {e}") + except Exception as e: + logger.error( + f"Unexpected error detecting deleted nodes for '{orm_type.__name__}': {e}", + exc_info=True, + ) + + logger.debug(f'Total deleted node UUIDs detected: {len(deleted_node_uuids)}') + return deleted_node_uuids + + def _detect_new_groups(self, current_mapping: GroupNodeMapping) -> list[GroupInfo]: + """Identifies all groups in the current mapping as 'new' with their labels.""" + logger.debug('Detecting initial set of all groups as new.') + + new_groups = [] + for group_uuid, node_uuids in current_mapping.group_to_nodes.items(): + # Load the group to get its label + try: + group = orm.load_group(group_uuid) + group_label = group.label + except Exception as e: + logger.warning(f'Failed to load group {group_uuid}: {e}') + group_label = None + + # Create GroupInfo with label + group_info = GroupInfo(uuid=group_uuid, node_count=len(node_uuids), label=group_label) + new_groups.append(group_info) + + return new_groups + + def _detect_group_changes( + self, + stored_mapping: GroupNodeMapping | None, + current_mapping: GroupNodeMapping, + specific_group_uuid: str | None = None, + ) -> GroupChanges: + """Detect changes between stored and current group mappings. + + :param stored_mapping: _description_ + :param current_mapping: _description_ + :param specific_group_uuid: _description_, defaults to None + :return: _description_ + """ + logger.debug('Calculating group changes diff...') + + group_changes: GroupChanges + + # --- Calculate initial diff based on membership --- + if stored_mapping is None: + # If no previous mapping, consider all current groups as new + new_groups = self._detect_new_groups(current_mapping) + group_changes = GroupChanges(new=new_groups) + logger.debug(f'Initial group detection: Found {len(group_changes.new)} groups.') + else: + try: + # Calculate the difference using the mapping's diff method + group_changes = stored_mapping.diff(current_mapping) + logger.debug( + f'Group mapping diff calculated: {len(group_changes.new)} new, ' + f'{len(group_changes.deleted)} deleted, {len(group_changes.modified)} modified.' 
+ ) + except Exception as e: + logger.error(f'Error calculating group mapping diff: {e}', exc_info=True) + return GroupChanges() + + # --- Detect Renames (only if stored_mapping exists) --- + if stored_mapping: + self_group_uuids = set(stored_mapping.group_to_nodes.keys()) + other_group_uuids = set(current_mapping.group_to_nodes.keys()) + common_group_uuids = self_group_uuids & other_group_uuids + + logger.debug(f'Checking {len(common_group_uuids)} common groups for renames...') + for group_uuid in common_group_uuids: + try: + # Get old path from logger + old_path_abs = self.dump_logger.get_dump_path_by_uuid(group_uuid) + if not old_path_abs: + logger.debug(f'Could not find old path for common group UUID {group_uuid} in logger.') + continue + + # Get current group info from DB + current_group = orm.load_group(uuid=group_uuid) + current_label = current_group.label + + # Calculate expected current path based on current label + # Use the absolute path from DumpLogger's perspective + # Need dump_paths from the logger instance + organize_by_groups = self.config.organize_by_groups # Use config from detector + current_path_rel = DumpPaths._get_group_path(current_group, organize_by_groups) + current_path_abs = self.dump_logger.dump_paths.absolute / current_path_rel + + # Compare old path with expected current path + if old_path_abs.resolve() != current_path_abs.resolve(): + msg = ( + f'Detected rename for group UUID {group_uuid}: ' + f"Old path '{old_path_abs.name}', New path '{current_path_abs.name}' " + f"(New label: '{current_label}')" + ) + logger.info(msg) + group_changes.renamed.append( + GroupRenameInfo( + uuid=group_uuid, + old_path=old_path_abs, + new_path=current_path_abs, + new_label=current_label, + ) + ) + + except NotExistent: + # Should not happen for common UUIDs, but handle defensively + logger.error(f'Could not load group with common UUID {group_uuid} from DB.') + except Exception as e: + logger.error(f'Error checking rename for group UUID {group_uuid}: {e}', exc_info=True) + + # If a specific group is requested, filter the results + if specific_group_uuid: + logger.debug(f'Filtering group changes for specific group UUID: {specific_group_uuid}') + return self._filter_group_changes_for_group(group_changes, specific_group_uuid) + else: + return group_changes + + # TODO: Maybe allow for multiple groups? + def _detect_all_changes(self, group: Optional[Group] = None) -> tuple[NodeChanges, GroupNodeMapping]: + """Detect all node and group changes relevant for a dump operation. + + Orchestrates calls to more specific detection methods. 
+ + :param group: _description_, defaults to None + :return: _description_ + """ + logger.info('Starting change detection...') + # Clear grouped UUID cache at the start of each full detection run + self._grouped_node_uuids_cache = None + + # --- Get Current Mapping --- + try: + current_group_mapping = GroupNodeMapping.build_from_db() + logger.debug('Successfully built current group-node mapping from DB.') + except Exception as e: + logger.error(f'Failed to build current group-node mapping: {e}', exc_info=True) + current_group_mapping = GroupNodeMapping() + + # --- Determine Scope for Node Detection (Assign Enum member) --- + node_detection_scope: GroupDumpScope + if group is not None: + node_detection_scope = GroupDumpScope.IN_GROUP + logger.debug(f"Node detection scope set to '{node_detection_scope.name}' for group {group.label}") + elif self.config.group_scope == GroupDumpScope.NO_GROUP: + node_detection_scope = GroupDumpScope.NO_GROUP + logger.debug(f"Node detection scope set to '{node_detection_scope.name}'") + else: + node_detection_scope = GroupDumpScope.ANY + logger.debug(f"Node detection scope set to '{node_detection_scope.name}'") + + # --- Detect Node Changes --- + try: + # Call detect_new_nodes with the determined scope Enum member + new_nodes_store: DumpNodeStore = self._detect_new_nodes( + scope=node_detection_scope, + group=group, # Pass the specific group if scope is IN_GROUP + ) + except Exception as e: + logger.error(f'Error detecting new/modified nodes: {e}', exc_info=True) + new_nodes_store = DumpNodeStore() + + try: + deleted_node_uuids: set[str] = self._detect_deleted_nodes() + except Exception as e: + logger.error(f'Error detecting deleted nodes: {e}', exc_info=True) + deleted_node_uuids = set() + + node_changes = NodeChanges( + new_or_modified=new_nodes_store, + deleted=deleted_node_uuids, + ) + + # --- Return NodeChanges and current mapping --- + # TODO: Maybe centralize also calculating the group changes here? + # Group changes are calculated later by the Engine using the mappings + logger.info('Change detection finished.') + return node_changes, current_group_mapping + + def _filter_group_changes_for_group(self, changes: GroupChanges, group_uuid: str) -> GroupChanges: + """Filter GroupChangeInfo results for a specific group. 
+ + :param changes: _description_ + :param group_uuid: _description_ + :return: _description_ + """ + logger.debug(f'Filtering GroupChangeInfo for group UUID: {group_uuid}') + # Create a new GroupChanges object to hold the filtered results + filtered_changes = GroupChanges( + deleted=[g for g in changes.deleted if g.uuid == group_uuid], + new=[g for g in changes.new if g.uuid == group_uuid], + modified=[g for g in changes.modified if g.uuid == group_uuid], + node_membership={}, # Initialize as empty dict + ) + + # Filter node membership changes + for node_uuid, membership in changes.node_membership.items(): + involved_in_this_group = False + # Create a new NodeMembershipChange for the filtered result + filtered_membership = NodeMembershipChange() + + # Check if the specific group is in the added list for this node + if group_uuid in membership.added_to: + filtered_membership.added_to.append(group_uuid) + involved_in_this_group = True + + # Check if the specific group is in the removed list for this node + if group_uuid in membership.removed_from: + filtered_membership.removed_from.append(group_uuid) + involved_in_this_group = True + + # If the node's membership changed with respect to this specific group, add it + if involved_in_this_group: + filtered_changes.node_membership[node_uuid] = filtered_membership + + filtered_renamed = [r for r in changes.renamed if r.uuid == group_uuid] + filtered_changes.renamed = filtered_renamed + + logger.debug(f'Filtered group changes: {filtered_changes}') + return filtered_changes + + @staticmethod + def _get_calculation_descendants( + workflows: list[orm.WorkflowNode], + ) -> list[orm.CalculationNode]: + """Get CalculationNode descendants of the provided workflows. + + :param workflows: _description_ + :return: _description_ + """ + descendants: list[orm.CalculationNode] = [] + for workflow in workflows: + try: + # Use the `called_descendants` property which handles the traversal + descendants.extend( + node for node in workflow.called_descendants if isinstance(node, orm.CalculationNode) + ) + except Exception as e: + logger.warning(f'Could not get descendants for workflow {workflow.pk}: {e}') + # Ensure uniqueness using UUIDs as keys in a dict + unique_descendants = list(set(descendants)) + logger.debug(f'Found {len(unique_descendants)} unique calculation descendants for {len(workflows)} workflows.') + return unique_descendants + + +class DumpNodeQuery: + """Builds and executes database queries to find nodes for dumping.""" + + # Tags used in QueryBuilder for consistency + NODE_TAG = 'node' + GROUP_TAG = 'group_filter' + USER_TAG = 'user_filter' + COMPUTER_TAG = 'computer_filter' + CODE_TAG = 'code_filter' + + def __init__(self, config: DumpConfig): + self.config = config + + def _get_nodes( + self, + orm_type: Type[Node], + dump_times: DumpTimes, + scope: GroupDumpScope = GroupDumpScope.ANY, + group: Optional[Group] = None, + base_filters: Optional[Dict] = None, + ignore_time_filter: bool = False, + ) -> List[Node]: + """Query nodes based on the specified type, time, scope, and filters. + + :param orm_type: The AiiDA ORM Node class to query (e.g., CalculationNode). + :param dump_times: Object containing relevant timestamps for filtering. + :param scope: Determines the query scope (ANY, IN_GROUP, NO_GROUP), defaults to GroupDumpScope.ANY + :param group: The specific group to filter by when scope is IN_GROUP, defaults to None + :param base_filters: Pre-resolved base filters (e.g., time). 
+ If None, they will be resolved based on dump_times and config, defaults to None + :raises ValueError: If scope is IN_GROUP but no group is provided. + :return: A list of matching Node instances. + """ + logger.debug(f"Getting nodes for type {orm_type.__name__} with scope '{scope.name}'...") + + # 1. Resolve base filters (time, etc.) if not provided + if base_filters is None: + include_time_filter = self.config.filter_by_last_dump_time + resolved_filters = self._resolve_time_filters( + orm_type=orm_type, + dump_times=dump_times, + include_time_filter=include_time_filter, + ignore_time_filter=ignore_time_filter, + ) + else: + resolved_filters = base_filters.copy() + + # 2. Modify filters based on scope + if scope == GroupDumpScope.IN_GROUP: + if group is None: + raise ValueError('Scope is IN_GROUP but no group object was provided.') + # Group filtering handled by relationships + elif scope == GroupDumpScope.NO_GROUP: + grouped_node_uuids = self._query_grouped_node_uuids(orm_type) + if grouped_node_uuids: + if 'uuid' not in resolved_filters: + resolved_filters['uuid'] = {} + resolved_filters['uuid']['!in'] = list(grouped_node_uuids) + logger.debug(f'Query: Adding filter to exclude {len(grouped_node_uuids)} grouped nodes.') + # No specific filter needed for scope == GroupDumpScope.ANY + + # --- Build Query --- + qb = orm.QueryBuilder() + relationships: Dict[str, Any] = {} + + # 3. Append Group filter if scope is IN_GROUP + if scope == GroupDumpScope.IN_GROUP and group: + qb.append(orm.Group, filters={'uuid': group.uuid}, tag=self.GROUP_TAG) + relationships['with_group'] = self.GROUP_TAG + + # 4. Append related entity filters (User, Computer, Code) + qb, entity_relationships = self._resolve_qb_appends(qb, orm_type) + relationships.update(entity_relationships) + + # Add edge filter specifically for Code links if Code was appended + # NOTE: Not sure if this is needed? + if 'with_incoming' in relationships and relationships['with_incoming'] == self.CODE_TAG: + relationships['edge_filters'] = {'label': 'code'} + + # 5. Append the main node type with combined filters and relationships + qb.append(orm_type, filters=resolved_filters, tag=self.NODE_TAG, **relationships) + # --- End Build Query --- + + # 6. Execute query using shared helper + return self._execute_query(qb, orm_type.__name__, scope, group) + + def _query_grouped_node_uuids(self, orm_type: Type[Node]) -> Set[str]: + """Helper to query UUIDs of nodes of orm_type present in any group. + + :param orm_type: _description_ + :return: _description_ + """ + logger.debug(f'Querying grouped node UUIDs for {orm_type.__name__}...') + qb_in_group = orm.QueryBuilder() + qb_in_group.append(orm.Group, tag='g_sub') + qb_in_group.append(orm_type, with_group='g_sub', project='uuid', tag='n_in_g') + grouped_uuids = set(qb_in_group.all(flat=True)) + logger.debug(f'Found {len(grouped_uuids)} grouped nodes of type {orm_type.__name__}.') + return grouped_uuids + + def _resolve_time_filters( + self, + orm_type: Type[Node], + dump_times: DumpTimes, + include_time_filter: bool = True, + ignore_time_filter: bool = False, + ) -> Dict: + """Create time-based query filters based on dump configuration. 
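A minimal sketch of what these filters look like when consumed (the timestamps below are stand-ins, not values from the patch):

    from datetime import datetime, timedelta, timezone

    from aiida import orm

    now = datetime.now(timezone.utc)         # stand-in for dump_times.current
    last_dump = now - timedelta(days=1)      # stand-in for dump_times.last

    # _resolve_time_filters returns a dict of the form
    # {'mtime': {'>=': last_dump, '<=': now}}, which _get_nodes passes
    # on to QueryBuilder as the node filters:
    qb = orm.QueryBuilder()
    qb.append(orm.CalculationNode, filters={'mtime': {'>=': last_dump, '<=': now}}, project='*')
    recently_modified = qb.all(flat=True)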
+ + :param orm_type: _description_ + :param dump_times: _description_ + :param include_time_filter: _description_, defaults to True + :return: _description_ + """ + filters: dict = {} + + # --- Time Filters --- + # Skip adding mtime filter if ignore_time_filter is True or no time filter is explicitly requested + if not include_time_filter or ignore_time_filter: + logger.debug(f'Skipping time filters for {orm_type.__name__}.') + return filters + + time_filters = {} + # TODO: Use current dump time instead of now + now = dump_times.current + + # --- Determine Upper Bound (end_date or now) --- + upper_bound = now + if self.config.end_date is not None: + # Pydantic should have validated/parsed already + upper_bound = min(now, self.config.end_date.astimezone()) + logger.debug(f'Using explicit end_date: {upper_bound.isoformat()}') + time_filters['<='] = upper_bound + + # --- Determine Lower Bound (past_days, start_date, last_dump_time) --- + lower_bound: Optional[datetime] = None + + # 1. Priority: past_days + if self.config.past_days is not None: + try: + days = int(self.config.past_days) + lower_bound = now - timedelta(days=days) + logger.debug(f'Using past_days={days}, calculated start date: {lower_bound.isoformat()}') + except (ValueError, TypeError): + logger.warning(f"Invalid value for past_days: '{self.config.past_days}'. Ignoring.") + + # 2. Next Priority: start_date (only if past_days wasn't used) + elif self.config.start_date is not None: + # Pydantic should have validated/parsed already + lower_bound = self.config.start_date.astimezone() + logger.debug(f'Using explicit start_date: {lower_bound.isoformat()}') + + # 3. Fallback: filter_by_last_dump_time (only if others weren't used) + elif self.config.filter_by_last_dump_time and dump_times.last is not None: + lower_bound = dump_times.last.astimezone() + logger.debug(f'Using last dump time as start date: {lower_bound.isoformat()}') + + if lower_bound: + # Ensure lower bound is not after upper bound + if lower_bound > upper_bound: + msg = ( + f'Calculated start time {lower_bound.isoformat()} is after end time ' + '{upper_bound.isoformat()}. Query might yield no results.' + ) + logger.warning(msg) + # Adjust lower_bound to upper_bound to avoid invalid range? Or let QB handle it? Let QB handle. + time_filters['>='] = lower_bound + + if time_filters: + filters['mtime'] = time_filters + logger.debug(f'Applying time filters for {orm_type.__name__}: {time_filters}') + else: + logger.debug(f'No time filters applied for {orm_type.__name__}.') + + return filters + + def _resolve_qb_appends(self, qb: QueryBuilder, orm_type: Type[Node]) -> Tuple[QueryBuilder, Dict]: + """Appends related entity filters (User, Computer, Code) based on config. + + :param qb: _description_ + :param orm_type: _description_ + :return: _description_ + """ + relationships_to_add = {} + + # Use tags defined in the class + user_tag = self.USER_TAG + computer_tag = self.COMPUTER_TAG + code_tag = self.CODE_TAG + + # --- User Filter --- + if self.config.user: + # Ensure user is loaded correctly before accessing pk + if isinstance(self.config.user, orm.User) and self.config.user.pk is not None: + qb.append(orm.User, filters={'id': self.config.user.pk}, tag=user_tag) + relationships_to_add['with_user'] = user_tag + logger.debug(f'QB: Appending User filter for {self.config.user.email}') + else: + logger.warning(f'Invalid or unloaded user provided: {self.config.user}. 
Skipping filter.') + + # --- Computer Filter --- + if self.config.computers: + computer_pks = [ + comp.pk for comp in self.config.computers if isinstance(comp, orm.Computer) and comp.pk is not None + ] + if computer_pks: + qb.append( + orm.Computer, + filters={'id': {'in': computer_pks}}, + tag=computer_tag, + ) + # Check if relationship is 'with_computer' or something else based on orm_type + relationships_to_add['with_computer'] = computer_tag + # Add other relationships if needed for different node types + logger.debug(f'QB: Appending Computer filter for PKs: {computer_pks}') + # Check if list was provided but contained invalid items + elif self.config.computers: + logger.warning('Computer filter provided, but no valid/loaded Computer objects found. Skipping.') + + # --- Code Filter --- + if self.config.codes: + code_pks = [code.pk for code in self.config.codes if isinstance(code, orm.Code) and code.pk is not None] + if code_pks: + qb.append(orm.Code, filters={'id': {'in': code_pks}}, tag=code_tag) + # NOTE: Not sure if this is needed + relationships_to_add['with_incoming'] = code_tag + logger.debug(f'QB: Appending Code filter for PKs: {code_pks}') + # Check if list was provided but contained invalid items + elif self.config.codes: + logger.warning('Code filter provided, but no valid/loaded Code objects found. Skipping.') + + return qb, relationships_to_add + + def _execute_query( + self, + qb: QueryBuilder, + orm_type_name: str, + scope: GroupDumpScope, + group: Optional[Group], + ) -> List[Node]: + """Executes the QueryBuilder query and handles results/errors. + + :param qb: _description_ + :param orm_type_name: _description_ + :param scope: _description_ + :param group: _description_ + :return: _description_ + """ + scope_detail = ( + f" in group '{group.label}'" if scope == GroupDumpScope.IN_GROUP and group else f' ({scope.name})' + ) + try: + # Ensure we project the node itself with the correct tag + if self.NODE_TAG not in qb._projections: + qb.add_projection(self.NODE_TAG, '*') + + results: list[orm.Node] = cast(list[orm.Node], qb.all(flat=True)) + logger.debug(f'Query for {orm_type_name}{scope_detail} returned {len(results)} candidate nodes.') + return results + except Exception as e: + logger.error( + f'Query failed for {orm_type_name}{scope_detail}: {e}', + exc_info=True, + ) + return [] diff --git a/src/aiida/tools/dumping/engine.py b/src/aiida/tools/dumping/engine.py new file mode 100644 index 0000000000..925cdd8b38 --- /dev/null +++ b/src/aiida/tools/dumping/engine.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from aiida import orm +from aiida.common.log import AIIDA_LOGGER +from aiida.tools.dumping.config import DumpConfig, DumpMode, ProfileDumpSelection +from aiida.tools.dumping.detect import DumpChangeDetector +from aiida.tools.dumping.logger import DumpLogger +from aiida.tools.dumping.managers.deletion import DeletionManager +from aiida.tools.dumping.managers.process import ProcessDumpManager +from aiida.tools.dumping.utils.helpers import DumpChanges, DumpTimes, GroupChanges +from aiida.tools.dumping.utils.paths import DumpPaths + +if TYPE_CHECKING: + from aiida.tools.dumping.mapping import GroupNodeMapping + + +logger = AIIDA_LOGGER.getChild('tools.dumping.engine') + + +class DumpEngine: + """Core engine that orchestrates the dump process.""" + + def __init__(self, dump_paths: DumpPaths, config: DumpConfig | None = None): + """Engine constructor that initializes all entities needed for dumping. 
+ + :param dump_paths: _description_ + :param config: _description_, defaults to None + """ + + self.dump_paths = dump_paths + + # --- Resolve Configuration from file and possibly passed values --- + # Loads from YAML if present and merges with the passed config object + # if not isinstance(config, DumpConfig): + # msg = f"DumpEngine expects a DumpConfig object, got {type(config)}" + # raise TypeError(msg) + self.config: DumpConfig = config + + # --- Initialize Times, Logger, and NodeGroupMapping --- + self.dump_times, self.dump_logger, self.stored_mapping = self._initialize_times_logger_and_mapping() + + # --- Initialize detector for changes --- + self.detector = DumpChangeDetector(self.dump_logger, self.config, self.dump_times) + + # --- Initialize Managers (pass dependencies) --- + self.process_manager = ProcessDumpManager( + config=self.config, + dump_paths=dump_paths, + dump_logger=self.dump_logger, + dump_times=self.dump_times, + ) + + def _initialize_times_logger_and_mapping( + self, + ) -> tuple[DumpTimes, DumpLogger, GroupNodeMapping | None]: + """Initialize dump times, load logger data, and load stored mapping. + + :return: _description_ + """ + logger.debug('Initializing dump times and logger...') + + # Clear log file if overwriting + if self.config.dump_mode == DumpMode.OVERWRITE and self.dump_paths.log_path.exists(): + try: + logger.info(f'Overwrite mode: Deleting existing log file {self.dump_paths.log_path}') + self.dump_paths.log_path.unlink() + except OSError as e: + logger.error(f'Failed to delete existing log file: {e}') + # Decide whether to proceed or raise an error + + # Load log data, stored mapping, and last dump time string from file + stores_coll, stored_mapping, last_dump_time_str = DumpLogger.load(self.dump_paths) + + # Initialize DumpTimes based on loaded time string + dump_times = DumpTimes.from_last_log_time(last_dump_time_str) + logger.debug(f'Dump times initialized: Current={dump_times.current}, Last={dump_times.last}') + + # Initialize DumpLogger instance with loaded data + dump_logger = DumpLogger( + dump_paths=self.dump_paths, + stores=stores_coll, + last_dump_time_str=last_dump_time_str, + ) + msg = ( + f'Dump logger initialized. Found {len(dump_logger.calculations)} calc logs, ' + f'{len(dump_logger.workflows)} wf logs, {len(dump_logger.groups)} group logs.' + ) + logger.debug(msg) + + if stored_mapping: + msg = f'Loaded stored group mapping with {len(stored_mapping.group_to_nodes)} groups.' + logger.debug(msg) + else: + msg = 'No stored group mapping found in log file.' + logger.debug(msg) + + return dump_times, dump_logger, stored_mapping + + def _load_config_from_yaml(self) -> DumpConfig | None: + """Attempts to load DumpConfig from the YAML file in the dump path. + + :return: _description_ + """ + config_path = self.dump_paths.config_path + + if not config_path.exists(): + logger.debug(f'Config file {config_path} not found. 
Using default/provided config.') + return None + + assert self.dump_paths.top_level is not None + config_path_rel = config_path.relative_to(self.dump_paths.top_level) + logger.info(f'Loading configuration from {config_path_rel}...') + try: + loaded_config = DumpConfig.parse_yaml_file(path=config_path) + logger.info('Successfully loaded configuration from file.') + return loaded_config + except Exception as e: + logger.error(f'Failed to load or parse config file {config_path}: {e}', exc_info=True) + return None + + def _save_config(self) -> None: + """Save the Pydantic config object to the dump's YAML file.""" + # Use the save method defined on the Pydantic DumpConfig model + try: + # This calls self.config.model_dump(mode='json') internally via the serializer + # and writes to the correct path using yaml.dump + self.config._save_yaml_file(self.dump_paths.config_path) + # The logger message "Dump configuration saved to ..." is now inside save_yaml_file + except Exception as e: + # Log error but avoid raising another error during finalization if possible + # Match original behavior of logging without re-raising here. + logger.error(f'Failed to save dump configuration during engine finalization: {e}', exc_info=True) + + def dump(self, entity: orm.ProcessNode | orm.Group | None = None) -> None: + """Selects and executes the appropriate dump strategy. + + :param entity: _description_, defaults to None + """ + + entity_type_msg = 'Starting dump process of ' + if entity is None: + entity_type_msg += 'default profile' + elif isinstance(entity, orm.Group): + entity_type_msg += f'group `{entity.label}`' + elif isinstance(entity, orm.ProcessNode): + entity_type_msg += f'process with pk `{entity.pk}`' + + entity_type_msg += f' in mode: {self.config.dump_mode.name}' + + logger.report(entity_type_msg) + + # --- Prepare Top-Level Path --- + if not self.config.dump_mode == DumpMode.DRY_RUN: + try: + DumpPaths._prepare_dump_path( + path_to_validate=self.dump_paths.absolute, + dump_mode=self.config.dump_mode, + safeguard_file=self.dump_paths.safeguard_file, + ) + except (FileNotFoundError, FileExistsError, ValueError, OSError) as e: + logger.critical(f'Failed to prepare dump directory {self.dump_paths.child}: {e}') + raise e + + # --- For process dump, I don't need any complicated logic + if isinstance(entity, orm.ProcessNode): + logger.info(f'Executing Process dump for node: PK={entity.pk}') + process_node = entity + process_top_level_path = self.dump_paths.absolute + logger.info(f'Dispatching node {process_node.pk} to ProcessManager...') + self.process_manager.dump( + process_node=process_node, + target_path=process_top_level_path, + ) + logger.info(f'ProcessManager finished processing node: PK={process_node.pk}') + try: + self.process_manager.readme_generator._generate(process_node, process_top_level_path) + except Exception as e: + logger.warning(f'Failed to generate README for process {process_node.pk}: {e}') + logger.info(f'Finished Process dump for node: PK={entity.pk}') + + # No mapping evaluated for ProcessNode + self.dump_logger.save(self.dump_times.current) + + else: + from aiida.tools.dumping.managers.profile import ProfileDumpManager + + # TODO: This is a bit of a hack right now such that there + # TODO: is no additional unneccessary nesting for group dump + # TODO: We use the same code as for the dumping of a profile, but with only one group selected + if isinstance(entity, orm.Group): + self.config.organize_by_groups = False + self.config.profile_dump_selection = ProfileDumpSelection.SPECIFIC 
+ self.config.groups = [entity.uuid] + + # --- Change Detection (for Dumping) --- + logger.info('Detecting node changes for dump...') + # node_changes now holds a NodeChanges object, current_mapping holds the mapping + node_changes, current_mapping = self.detector._detect_all_changes( + group=entity if isinstance(entity, orm.Group) else None + ) + # TODO: See if I should pass the mapping here or create it in the Manager + self.current_mapping = current_mapping + + logger.info('Detecting group changes for dump...') + group_changes: GroupChanges + group_changes = self.detector._detect_group_changes( + stored_mapping=self.stored_mapping, # Use mapping loaded at init + current_mapping=current_mapping, # Use mapping from node detection + specific_group_uuid=(entity.uuid if isinstance(entity, orm.Group) else None), + ) + + # Combine detected changes + all_changes = DumpChanges( + nodes=node_changes, + groups=group_changes, + ) + + # --- Check if any changes were detected --- + no_node_changes = len(all_changes.nodes.new_or_modified) == 0 and len(all_changes.nodes.deleted) == 0 + no_group_changes = ( + len(all_changes.groups.new) == 0 + and len(all_changes.groups.deleted) == 0 + and len(all_changes.groups.modified) == 0 + and len(all_changes.groups.renamed) == 0 + and len(all_changes.groups.node_membership) == 0 + ) + + # If `als_ungrouped` + if (no_node_changes and no_group_changes) and not self.config.also_ungrouped: + logger.report('No changes detected since last dump. Nothing to do.') + self.dump_logger.save(self.dump_times.current, self.current_mapping) + self._save_config() + return + + # --- Handle Deletion First (if requested) --- + if self.config.delete_missing: + logger.info('Deletion requested. Handling deleted entities...') + # --- Change Detection (needed *only* for deletion info) --- + logger.info('Detecting changes to identify deletions...') + # Detect node changes (yields NodeChanges and mapping) + # TODO: Independent of groups. Or is it? + deletion_manager = DeletionManager( + config=self.config, + dump_paths=self.dump_paths, + dump_logger=self.dump_logger, + dump_changes=all_changes, + stored_mapping=self.stored_mapping, + ) + deletion_manager._handle_deleted_entities() + + if self.config.dump_mode == DumpMode.DRY_RUN: + change_table = all_changes.to_table() + print(change_table) + return + + profile_manager = ProfileDumpManager( + selected_group=entity, + config=self.config, + dump_paths=self.dump_paths, + dump_logger=self.dump_logger, + process_manager=self.process_manager, + current_mapping=current_mapping, + detector=self.detector, + ) + + # Call the newly instantiated ProfileDumpManager + profile_manager.dump(changes=all_changes) + + # No mapping evaluated for ProcessNode + self.dump_logger.save(self.dump_times.current, current_mapping) + + # --- Finalize --- + logger.report('Saving final dump log, mapping, and configuration...') + self._save_config() diff --git a/src/aiida/tools/dumping/facades.py b/src/aiida/tools/dumping/facades.py new file mode 100644 index 0000000000..67f2b1fe49 --- /dev/null +++ b/src/aiida/tools/dumping/facades.py @@ -0,0 +1,350 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Dumper facades serving as public API for data dumping feature.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, Type + +from aiida import orm +from aiida.common.exceptions import NotExistent +from aiida.common.log import AIIDA_LOGGER +from aiida.tools.dumping.config import DumpConfig +from aiida.tools.dumping.engine import DumpEngine +from aiida.tools.dumping.utils.paths import DumpPaths + +logger = AIIDA_LOGGER.getChild('tools.dumping.facades') + + +class ProcessDumper: + """Dumps data of a single ProcessNode.""" + + @classmethod + def from_config( + cls, + process: orm.ProcessNode | int | str, + config_path: str | Path, + output_path: Path | str | None = None, + **kwargs, + ) -> 'ProcessDumper': + """Creates a ProcessDumper instance configured from a YAML file. + + :param process: Identifier for the selected ``ProcessNode``. + :param config_path: Path of the dump config file. + :param output_path: Dump output path. If not given, same directory as where the config file is located. + :return: ProcessDumper instance. + """ + + verified_process = cls._verify_process_node(process) + config_path = Path(config_path).resolve() + + # Determine the final output path + if output_path is None: + resolved_path = Path(config_path).parent.resolve() + logger.info(f'No output_path specified, using default: {resolved_path}') + else: + resolved_path = Path(output_path).resolve() + + # Prepare extra args for __init__ + init_extra_args = {'process': verified_process} + + # Delegate to the helper function + return _create_dumper_instance( + cls=cls, + config_path=config_path, + output_path=resolved_path, + cls_init_extra_args=init_extra_args, + **kwargs, + ) + + def __init__( + self, + process: orm.ProcessNode | int | str, + config: DumpConfig | None = None, + output_path: str | Path | None = None, + ) -> None: + """Initialize the ProcessDumper, which handles exporting a single AiiDA ProcessNode. + + :param process: The ``ProcessNode`` to dump, either given as ORM instance, or its PK or UUID. + :param config: An optional ``DumpConfig`` object that controls what data to include in the dump. + If ``None``, default dump settings are used. + :param output_path: Optional base path to write the dump to. Can be a string or ``Path``. + If ``None``, a default path based on the profile name will be used. + """ + self.process_node = ProcessDumper._verify_process_node(process) + self.config: DumpConfig = config if config is not None else DumpConfig() + + # Resolve DumpPaths based on output_path and the node + if output_path is None: + default_path = DumpPaths._get_default_process_dump_path(process_node=self.process_node) + self.dump_paths = DumpPaths(parent=Path.cwd(), child=default_path) + else: + self.dump_paths = DumpPaths.from_path(Path(output_path).resolve()) + + @staticmethod + def _verify_process_node( + identifier: orm.ProcessNode | int | str, + ) -> orm.ProcessNode: + """Verify that an identifier yields a valid ProcessNode instance. + + :raises NotExistent: If node not found for identifier. + :raises TypeError: If loaded node is not a ProcessNode or input type is wrong. + :raises ValueError: If another loading error occurs. 
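A typical call through this facade, sketched with a hypothetical PK and output path:

    from aiida.tools.dumping import DumpConfig, ProcessDumper

    config = DumpConfig(include_inputs=True, include_outputs=True)
    ProcessDumper(process=1234, config=config, output_path='/tmp/process-1234').dump()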
+ """ + if isinstance(identifier, orm.ProcessNode): + return identifier + elif isinstance(identifier, (int, str)): + try: + loaded_node: orm.Node = orm.load_node(identifier=identifier) + except NotExistent: + logger.error(f"Process node with identifier '{identifier}' not found.") + raise + except Exception as exc: + msg = f"Error loading process node via identifier '{identifier}': {exc}" + raise ValueError(msg) from exc + + if not isinstance(loaded_node, orm.ProcessNode): + msg = f'Node <{loaded_node.pk}> loaded, but it is not an orm.ProcessNode.' + raise TypeError(msg) + return loaded_node + else: + msg = f'Invalid type for process identifier: {type(identifier)}' + raise TypeError(msg) + + def dump(self) -> None: + """Perform the dump operation by invoking the engine.""" + # Instantiate engine for dump operation rather than on construction such that + # Successive incremental dumps can be achieved with one instance + engine = DumpEngine(config=self.config, dump_paths=self.dump_paths) + engine.dump(entity=self.process_node) + + +class GroupDumper: + """Dumps data in an AiiDA group.""" + + @classmethod + def from_config( + cls, + group: orm.Group | str | int, + config_path: str | Path, + output_path: Path | str | None = None, + **kwargs, + ) -> 'GroupDumper': + """Creates a GroupDumper instance configured from a YAML file. + + :param process: Identifier for the selected ``Group``. + :param config_path: Path of the dump config file. + :param output_path: Dump output path. If not given, same directory as where the config file is located. + :return: GroupDumper instance. + """ + verified_group = cls._verify_group(group) + config_path = Path(config_path).resolve() + + # Determine the final output path + if output_path is None: + resolved_path = Path(config_path).parent.resolve() + logger.info(f'No output_path specified, using default: {resolved_path}') + else: + resolved_path = Path(output_path).resolve() + + # Prepare extra args for __init__ + init_extra_args = {'group': verified_group} + + # Delegate to the helper function + return _create_dumper_instance( + cls=cls, + config_path=config_path, + output_path=resolved_path, + cls_init_extra_args=init_extra_args, + **kwargs, + ) + + def __init__( + self, + group: orm.Group | str, + config: DumpConfig | None = None, + output_path: str | Path | None = None, + ) -> None: + """Initialize the GroupDumper, which handles exporting the data in an AiiDA group. + + :param group: The Group to dump, either given as ORM instance, or its PK or UUID. + :param config: An optional ``DumpConfig`` object that controls what data to include in the dump. + If ``None``, default dump settings are used. + :param output_path: Optional base path to write the dump to. Can be a string or ``Path``. + If ``None``, a default path based on the group label name will be used. + """ + + self.group: orm.Group = GroupDumper._verify_group(group) + self.config: DumpConfig = config if config is not None else DumpConfig() + + if output_path is None: + default_path = DumpPaths._get_default_group_dump_path(self.group) + self.dump_paths = DumpPaths(parent=Path.cwd(), child=default_path) + else: + self.dump_paths = DumpPaths.from_path(Path(output_path).resolve()) + + @staticmethod + def _verify_group(identifier: orm.Group | str | int) -> orm.Group: + """Verify the input is a valid Group instance or load it. + + :raises NotExistent: If group not found for identifier. + :raises TypeError: If input type is wrong. + :raises ValueError: If another loading error occurs. 
+ """ + if isinstance(identifier, orm.Group): + # If it's already a Group instance, just return it + return identifier + elif isinstance(identifier, (str, int)): + try: + return orm.load_group(identifier=identifier) + except NotExistent: + logger.error(f"Group with identifier '{identifier}' not found.") + raise + except Exception as exc: + msg = f"Error loading group via identifier '{identifier}': {exc}" + raise ValueError(msg) from exc + else: + msg = f'Invalid type for group identifier: {type(identifier)}' + raise TypeError(msg) + + def dump(self): + """Perform the dump operation. Simply delegate to the engine.""" + # Instantiate engine for dump operation rather than on construction such that + # Successive incremental dumps can be achieved with one instance + engine = DumpEngine(config=self.config, dump_paths=self.dump_paths) + engine.dump(entity=self.group) + + +class ProfileDumper: + """Dumps data from the currently loaded AiiDA profile.""" + + @classmethod + def from_config( + cls, + config_path: str | Path, + output_path: Path | str | None = None, + **kwargs, + ) -> 'ProfileDumper': + """Creates a ProfileDumper instance configured from a YAML file. + + :param config_path: Path of the dump config file. + :param output_path: Dump output path. If not given, same directory as where the config file is located. + :return: ProfileDumper instance. + """ + config_path = Path(config_path).resolve() + + # Determine the final output path + if output_path is None: + resolved_path = Path(config_path).parent.resolve() + else: + resolved_path = Path(output_path).resolve() + + # No extra args needed for ProfileDumper.__init__ + init_extra_args = {} + + # Delegate to the helper function + return _create_dumper_instance( + cls=cls, + config_path=config_path, + output_path=resolved_path, + cls_init_extra_args=init_extra_args, + **kwargs, + ) + + def __init__(self, config: DumpConfig | None = None, output_path: str | Path | None = None) -> None: + """Initialize the ProfileDumper, which handles exporting data from an AiiDA profile. + + :param profile: The AiiDA profile to dump. Can be a `Profile` object, a profile name as a string, + or ``None`` to use the currently loaded profile. + :param config: An optional ``DumpConfig`` object that controls what data to include in the dump. + If ``None``, default dump settings are used. + :param output_path: Optional base path to write the dump to. Can be a string or ``Path``. + If ``None``, a default path based on the profile name will be used. + """ + + self.config: DumpConfig = config if config is not None else DumpConfig() + + if output_path is None: + default_path = DumpPaths._get_default_profile_dump_path() + self.dump_paths = DumpPaths(parent=Path.cwd(), child=default_path) + else: + self.dump_paths = DumpPaths.from_path(Path(output_path).resolve()) + + def dump(self): + """Perform the dump operation. 
This simply delegates to the engine.""" + # Instantiate engine for dump operation rather than on construction such that + # Successive incremental dumps can be achieved with one instance + engine = DumpEngine(config=self.config, dump_paths=self.dump_paths) + engine.dump() + + +def _apply_kwargs_overrides(config: DumpConfig, **kwargs) -> DumpConfig: + """Applies kwargs overrides to a Pydantic DumpConfig object.""" + if not kwargs: + return config + + try: + # Filter kwargs to only include valid field names for DumpConfig + # Pydantic's model_fields gives valid field names + valid_field_names = set(config.model_fields.keys()) + valid_kwargs = {k: v for k, v in kwargs.items() if k in valid_field_names} + ignored_kwargs = {k: v for k, v in kwargs.items() if k not in valid_field_names} + + if ignored_kwargs: + logger.warning(f'Ignoring unknown configuration kwargs: {list(ignored_kwargs.keys())}') + + if not valid_kwargs: + return config + + # Use model_copy which handles validation on update + final_config = config.model_copy(update=valid_kwargs) + return final_config + + except Exception as e: + msg = f'Error applying kwargs overrides: {e}. Returning original config.' + logger.error(msg, exc_info=True) + return config + + +def _create_dumper_instance( + *, + cls: Type[ProcessDumper | GroupDumper | ProfileDumper], + config_path: str | Path, + output_path: Path, + cls_init_extra_args: Dict[str, Any], + **kwargs, +) -> ProcessDumper | GroupDumper | ProfileDumper: + """Internal helper to load config, apply overrides, create paths, and instantiate a dumper facade. + + :param cls: The ``Dumper`` class that should be instantiated. + :param config_path: Path of the dump config file. + :param output_path: Dump output path. If not given, same directory as where the config file is located. + :param cls_init_extra_args: Additional process or group identifier. + :return: Instantiated ``Dumper`` class. + """ + # 1. Load config from file using Pydantic parsing + try: + loaded_config = DumpConfig.parse_yaml_file(config_path) + except (FileNotFoundError, ValueError) as e: + # Log details before re-raising for the specific facade context + logger.error(f'Error loading config for {cls.__name__}: {e}.') + raise + + # 2. Apply kwargs overrides to the loaded config + final_config = _apply_kwargs_overrides(loaded_config, **kwargs) + + # 3. Instantiate and return the specific dumper facade class + # Combine config, paths, and any extra args needed by the specific __init__ + init_args = { + 'config': final_config, + 'output_path': output_path, + **cls_init_extra_args, # Add process/group if provided + } + return cls(**init_args) diff --git a/src/aiida/tools/dumping/logger.py b/src/aiida/tools/dumping/logger.py new file mode 100644 index 0000000000..c41c3dc84f --- /dev/null +++ b/src/aiida/tools/dumping/logger.py @@ -0,0 +1,536 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +from __future__ import annotations + +import json +from collections.abc import Collection +from dataclasses import dataclass, field, fields +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from aiida.common import timezone +from aiida.common.log import AIIDA_LOGGER +from aiida.tools.dumping.mapping import GroupNodeMapping +from aiida.tools.dumping.utils.helpers import DumpStoreKeys, StoreNameType +from aiida.tools.dumping.utils.paths import DumpPaths + +logger = AIIDA_LOGGER.getChild('tools.dumping.logger') + + +@dataclass +class DumpLog: + """Represents a single dump log entry.""" + + path: Path + symlinks: List[Path] = field(default_factory=list) + duplicates: List[Path] = field(default_factory=list) + dir_mtime: Optional[datetime] = None + dir_size: Optional[int] = None + + def to_dict(self) -> dict: + # Add mtime serialization if included + return { + 'path': str(self.path), + 'symlinks': [str(path) for path in self.symlinks] if self.symlinks else [], + 'duplicates': [str(path) for path in self.duplicates], + 'dir_mtime': self.dir_mtime.isoformat() if self.dir_mtime else None, + 'dir_size': self.dir_size, + } + + @classmethod + def from_dict(cls, data: dict) -> 'DumpLog': + symlinks = [] + if data.get('symlinks'): + symlinks = [Path(path) for path in data['symlinks']] + # Add mtime deserialization if included + # mtime = datetime.fromisoformat(data['mtime']) if data.get('mtime') else None + duplicates = [] + if data.get('duplicates'): + duplicates = [Path(path) for path in data['duplicates']] + + # Deserialize datetime from ISO string, handle None + dir_mtime_str = data.get('dir_mtime') + dir_mtime = None + if dir_mtime_str: + try: + dir_mtime = datetime.fromisoformat(dir_mtime_str) + # Ensure timezone-awareness if needed (assuming UTC if naive) + if dir_mtime.tzinfo is None: + dir_mtime = timezone.make_aware(dir_mtime) # Use AiiDA's timezone utility + except ValueError: + logger.warning(f'Could not parse dir_mtime string: {dir_mtime_str}') + + dir_size = data.get('dir_size') # Size should be stored as int + + return cls( + path=Path(data['path']), + symlinks=symlinks, + duplicates=duplicates, + dir_mtime=dir_mtime, + dir_size=dir_size, + ) + + def add_symlink(self, path: Path) -> None: + """Add a symlink path to this log entry.""" + if path not in self.symlinks: + self.symlinks.append(path) + + def remove_symlink(self, path_to_remove: Path) -> bool: + """Remove a symlink path from this log entry, comparing resolved paths.""" + resolved_path_to_remove = path_to_remove.resolve() + original_length = len(self.symlinks) + # Filter out paths that resolve to the same location + self.symlinks = [ + p + for p in self.symlinks + if not p.exists() or p.resolve() != resolved_path_to_remove # Check exists() first for broken links + ] + return len(self.symlinks) < original_length + + def add_duplicate(self, path: Path) -> None: + """Add a duplicate dump path to this log entry.""" + if path not in self.duplicates: + self.duplicates.append(path) + + def remove_duplicate(self, path: Path) -> bool: + """Remove a duplicate dump path from this log entry.""" + if path in self.duplicates: + self.duplicates.remove(path) + return True + return False + + 
+@dataclass +class DumpLogStore: + """A store for DumpLog entries, indexed by UUID.""" + + entries: Dict[str, DumpLog] = field(default_factory=dict) + + def add_entry(self, uuid: str, entry: DumpLog) -> None: + """Add a single entry to the container.""" + self.entries[uuid] = entry + + def add_entries(self, entries: Dict[str, DumpLog]) -> None: + """Add a collection of entries to the container.""" + self.entries.update(entries) + + def del_entry(self, uuid: str) -> bool: + """Remove a single entry by UUID.""" + if uuid in self.entries: + del self.entries[uuid] + return True + return False + + def del_entries(self, uuids: Collection[str]) -> None: + """Remove a collection of entries by UUID.""" + for uuid in uuids: + if uuid in self.entries: + del self.entries[uuid] + + def get_entry(self, uuid: str) -> Optional[DumpLog]: + """Retrieve a single entry by UUID.""" + return self.entries.get(uuid) + + def __len__(self) -> int: + """Return the number of entries in the container.""" + return len(self.entries) + + def __iter__(self): + """Iterate over all entries.""" + return iter(self.entries.items()) + + def to_dict(self) -> Dict: + return {uuid: entry.to_dict() for uuid, entry in self.entries.items()} + + @classmethod + def from_dict(cls, data: Dict) -> DumpLogStore: + store = cls() + for uuid, entry_data in data.items(): + store.entries[uuid] = DumpLog.from_dict(entry_data) + return store + + def update_paths(self, old_str: str, new_str: str) -> None: + """Update paths by replacing substrings.""" + # Keep this method as it operates solely on paths within the store + for uuid, entry in self.entries.items(): + path_str = str(entry.path) + if old_str in path_str: + entry.path = Path(path_str.replace(old_str, new_str)) + # Update symlinks + for i, symlink_path in enumerate(entry.symlinks): + symlink_str = str(symlink_path) + if old_str in symlink_str: + entry.symlinks[i] = Path(symlink_str.replace(old_str, new_str)) + # Update duplicates + updated_duplicates = [] + for duplicate_path in entry.duplicates: + duplicate_str = str(duplicate_path) + if old_str in duplicate_str: + updated_duplicates.append(Path(duplicate_str.replace(old_str, new_str))) + else: + updated_duplicates.append(duplicate_path) + entry.duplicates = updated_duplicates + + +@dataclass +class DumpLogStoreCollection: + """Represents the entire log data.""" + + calculations: DumpLogStore = field(default_factory=DumpLogStore) + workflows: DumpLogStore = field(default_factory=DumpLogStore) + groups: DumpLogStore = field(default_factory=DumpLogStore) + data: DumpLogStore = field(default_factory=DumpLogStore) + + +class DumpLogger: + """Handles loading, saving, and accessing dump log data.""" + + def __init__( + self, + dump_paths: DumpPaths, + stores: DumpLogStoreCollection, + last_dump_time_str: str | None = None, + ) -> None: + """ + Initialize the DumpLogger. Should typically be instantiated via `load`. + """ + self.dump_paths = dump_paths + # Stores are now passed in directly + self.calculations = stores.calculations + self.workflows = stores.workflows + self.groups = stores.groups + self.data = stores.data + # Store the raw string time from the log + self._last_dump_time_str = last_dump_time_str + + @staticmethod + def load( + dump_paths: DumpPaths, + ) -> Tuple[DumpLogStoreCollection, GroupNodeMapping | None, str | None]: + """Load log data and mapping from the log file. + + Returns: + A tuple containing: + - DumpLogStoreCollection: The loaded stores. 
+ - GroupNodeMapping | None: The loaded group mapping, or None if not found/error. + - str | None: The ISO timestamp string of the last dump, or None. + + :param dump_paths: _description_ + :return: _description_ + """ + stores = DumpLogStoreCollection() # Default empty stores + group_node_mapping = None + last_dump_time_str = None + + if not dump_paths.log_path.exists(): + logger.debug(f'Log file not found at {dump_paths.log_path}, returning empty log data.') + return stores, group_node_mapping, last_dump_time_str + + try: + with dump_paths.log_path.open('r', encoding='utf-8') as f: + prev_dump_data = json.load(f) + + # Load last dump time string + last_dump_time_str = prev_dump_data.get('last_dump_time') + + # Load group-node mapping if present + if 'group_node_mapping' in prev_dump_data: + try: + group_node_mapping = GroupNodeMapping.from_dict(prev_dump_data['group_node_mapping']) + except Exception as e: + logger.warning(f'Error loading group-node mapping: {e!s}') + + # Load store data using deserialize_logs helper + stores.calculations = DumpLogger._deserialize_logs( + prev_dump_data.get('calculations', {}), dump_paths=dump_paths + ) + stores.workflows = DumpLogger._deserialize_logs(prev_dump_data.get('workflows', {}), dump_paths=dump_paths) + stores.groups = DumpLogger._deserialize_logs(prev_dump_data.get('groups', {}), dump_paths=dump_paths) + stores.data = DumpLogger._deserialize_logs(prev_dump_data.get('data', {}), dump_paths=dump_paths) + + except (json.JSONDecodeError, OSError, ValueError) as e: + logger.warning(f'Error loading dump log file {dump_paths.log_path}: {e!s}') + # Return default empty data on error + return DumpLogStoreCollection(), None, None + + return stores, group_node_mapping, last_dump_time_str + + def get_last_dump_time(self) -> datetime | None: + """Parse and return the last dump time, if available.""" + if self._last_dump_time_str: + try: + return datetime.fromisoformat(self._last_dump_time_str) + except ValueError: + logger.warning(f'Could not parse last dump time string: {self._last_dump_time_str}') + return None + + def add_entry(self, store_key: StoreNameType, uuid: str, entry: DumpLog) -> None: + """Add a log entry for a node to the specified store.""" + store = self.get_store_by_name(store_key) + store.add_entry(uuid, entry) + + def del_entry(self, store_key: StoreNameType, uuid: str) -> bool: + """Delete a log entry from the specified store.""" + store = self.get_store_by_name(store_key) + return store.del_entry(uuid) + + @property + def stores_collection(self) -> DumpLogStoreCollection: + """Retrieve the current state of the log stores as a dataclass.""" + # Corrected: use the instance's stores + return DumpLogStoreCollection( + calculations=self.calculations, + workflows=self.workflows, + groups=self.groups, + data=self.data, + ) + + # TODO: This currently requires the dump time as argument, not sure if this is what I want + def save( + self, + current_dump_time: datetime, + group_node_mapping: GroupNodeMapping | None = None, + ) -> None: + """Save the current log state and mapping to the JSON file.""" + log_dict = { + # Use the _serialize_logs helper method + 'calculations': self._serialize_logs(self.calculations), + 'workflows': self._serialize_logs(self.workflows), + 'groups': self._serialize_logs(self.groups), + 'data': self._serialize_logs(self.data), + 'last_dump_time': current_dump_time.isoformat(), + } + + if group_node_mapping: + log_dict['group_node_mapping'] = group_node_mapping.to_dict() + + try: + with self.dump_paths.log_path.open('w', 
encoding='utf-8') as f: + json.dump(log_dict, f, indent=4) + logger.debug(f'Dump log saved to {self.dump_paths.log_path}') + except OSError as e: + logger.error(f'Failed to save dump log to {self.dump_paths.log_path}: {e!s}') + + def _serialize_logs(self, container: DumpLogStore) -> Dict: + """Serialize log entries to a dictionary format relative to dump parent.""" + serialized = {} + for uuid, entry in container.entries.items(): + try: + # Use the DumpLog's to_dict method which now includes new fields + entry_dict = entry.to_dict() # <-- This dict now contains all fields + + # Convert paths to relative strings for serialization + # Ensure keys exist before attempting conversion + if 'path' in entry_dict and entry_dict['path'] is not None: + entry_dict['path'] = str(Path(entry_dict['path']).relative_to(self.dump_paths.parent)) + if 'symlinks' in entry_dict: + entry_dict['symlinks'] = [ + str(Path(p).relative_to(self.dump_paths.parent)) for p in entry_dict['symlinks'] + ] + if 'duplicates' in entry_dict: + entry_dict['duplicates'] = [ + str(Path(p).relative_to(self.dump_paths.parent)) for p in entry_dict['duplicates'] + ] + + # Add the complete entry dict (including mtime/size) to serialized output + serialized[uuid] = entry_dict # <-- Use the full dict from to_dict() + + except ValueError: + # Fallback if path is not relative - use absolute paths from to_dict() + msg = ( + f'Path {entry.path} or its links/duplicates not relative to {self.dump_paths.parent}.' + 'Storing absolute.' + ) + logger.warning(msg) + serialized[uuid] = entry.to_dict() # Store absolute paths using full dict + except Exception as e: + logger.error(f'Error serializing log entry for {uuid}: {e}', exc_info=True) + # Optionally add a marker for the failed entry + serialized[uuid] = {'error': 'Serialization failed'} + return serialized + + @staticmethod + def _deserialize_logs(category_data: Dict, dump_paths: DumpPaths) -> DumpLogStore: + """Deserialize log entries using DumpLog.from_dict and make paths absolute.""" + container = DumpLogStore() + for uuid, entry_data in category_data.items(): + try: + log_entry: Optional[DumpLog] = None + # Handle new format (dict) + if isinstance(entry_data, dict) and 'path' in entry_data: + # Use from_dict to get all fields correctly + log_entry = DumpLog.from_dict(entry_data) + # Now make paths absolute based on dump_paths.parent + # Note: Assumes paths in JSON are relative to dump_paths.parent + log_entry.path = dump_paths.parent / log_entry.path + log_entry.symlinks = [dump_paths.parent / p for p in log_entry.symlinks] + log_entry.duplicates = [dump_paths.parent / p for p in log_entry.duplicates] + + if log_entry: + container.add_entry(uuid, log_entry) + + except Exception as e: + logger.warning(f'Failed to deserialize log entry for UUID {uuid}: {e}', exc_info=True) # Add exc_info + return container + + def get_store_by_uuid(self, uuid: str) -> DumpLogStore | None: + """Find the store that contains the given UUID.""" + stores_coll = self.stores_collection # Use the property + for field_ in fields(stores_coll): + store = getattr(stores_coll, field_.name) + if uuid in store.entries: + return store + # Return None instead of raising NotExistent for easier checking + logger.debug(f'UUID {uuid} not found in any log store.') + return None + + def get_store_by_name(self, name: StoreNameType) -> DumpLogStore: + """Get the store by its string literal name.""" + stores_coll = self.stores_collection # Use the property + if hasattr(stores_coll, name): + return getattr(stores_coll, name) + else: + 
store_names = [field.name for field in fields(stores_coll)] + msg = f'Wrong store key <{name}> selected. Choose one of {store_names}.' + raise ValueError(msg) + + def get_dump_path_by_uuid(self, uuid: str) -> Optional[Path]: + """Find the dump path for an entity with the given UUID.""" + store = self.get_store_by_uuid(uuid=uuid) + if store and uuid in store.entries: + return store.entries[uuid].path + return None + + def get_store_by_orm(self, orm_type) -> DumpLogStore: + """Get the appropriate store for a given ORM type using DumpStoreKeys.""" + store_key_str = DumpStoreKeys.from_class(orm_type) + return self.get_store_by_name(store_key_str) # Use existing method + + def update_paths(self, old_base_path: Path, new_base_path: Path) -> int: + """Update all paths across all stores if they start with old_base_path. + + Replaces the old_base_path prefix with new_base_path. + + Args: + old_base_path: The absolute base path prefix to find. + new_base_path: The absolute base path prefix to replace with. + + Returns: + The total number of path entries (primary path, symlinks, duplicates) updated. + """ + update_count = 0 + logger.debug(f"Updating paths in logger: Replacing prefix '{old_base_path}' with '{new_base_path}'") + + # Ensure paths are absolute and resolved for reliable comparison + try: + old_resolved = old_base_path.resolve() + new_resolved = new_base_path.resolve() + except OSError as e: + logger.error(f'Error resolving paths for update: {e}. Aborting path update.') + return 0 + + stores_coll = self.stores_collection + for field_ in fields(stores_coll): + store: DumpLogStore = getattr(stores_coll, field_.name) + if not isinstance(store, DumpLogStore): + continue + + for uuid, entry in store.entries.items(): + updated_entry = False + # --- Update entry.path --- + try: + resolved_entry_path = entry.path.resolve() + if resolved_entry_path.is_relative_to(old_resolved): + relative_part = resolved_entry_path.relative_to(old_resolved) + new_path = new_resolved / relative_part + if entry.path != new_path: + logger.debug(f"Updating primary path for {uuid}: '{entry.path}' -> '{new_path}'") + entry.path = new_path + updated_entry = True + except (OSError, ValueError): # Handle resolve() errors or path not relative + logger.warning(f'Could not compare/update primary path for {uuid}: {entry.path}') + except Exception as e: + logger.error(f'Unexpected error updating primary path for {uuid}: {e}', exc_info=True) + + # --- Update entry.symlinks --- + updated_symlinks = [] + for symlink_path in entry.symlinks: + try: + resolved_symlink = symlink_path.resolve() + if resolved_symlink.is_relative_to(old_resolved): + relative_part = resolved_symlink.relative_to(old_resolved) + new_symlink = new_resolved / relative_part + updated_symlinks.append(new_symlink) + if symlink_path != new_symlink: + logger.debug(f"Updating symlink for {uuid}: '{symlink_path}' -> '{new_symlink}'") + updated_entry = True + else: + updated_symlinks.append(symlink_path) # Keep unchanged + except (OSError, ValueError): + logger.warning(f'Could not compare/update symlink for {uuid}: {symlink_path}') + updated_symlinks.append(symlink_path) # Keep original on error + except Exception as e: + logger.error(f'Unexpected error updating symlink for {uuid}: {e}', exc_info=True) + updated_symlinks.append(symlink_path) # Keep original on error + entry.symlinks = updated_symlinks + + # --- Update entry.duplicates --- + # (Similar logic as for symlinks) + updated_duplicates = [] + for duplicate_path in entry.duplicates: + try: + resolved_duplicate 
= duplicate_path.resolve() + if resolved_duplicate.is_relative_to(old_resolved): + relative_part = resolved_duplicate.relative_to(old_resolved) + new_duplicate = new_resolved / relative_part + updated_duplicates.append(new_duplicate) + if duplicate_path != new_duplicate: + logger.debug( + f"Updating duplicate path for {uuid}: '{duplicate_path}' -> '{new_duplicate}'" + ) + updated_entry = True + else: + updated_duplicates.append(duplicate_path) + except (OSError, ValueError): + logger.warning(f'Could not compare/update duplicate for {uuid}: {duplicate_path}') + updated_duplicates.append(duplicate_path) + except Exception as e: + logger.error(f'Unexpected error updating duplicate for {uuid}: {e}', exc_info=True) + updated_duplicates.append(duplicate_path) + entry.duplicates = updated_duplicates + + if updated_entry: + update_count += 1 # Count updated entries, not individual paths + + logger.info(f'Updated paths in {update_count} log entries.') + return update_count + + def remove_symlink_from_log_entry(self, node_uuid: str, symlink_path_to_remove: Path) -> bool: + """Finds the log entry for a node and removes a specific symlink path from it.""" + store = self.get_store_by_uuid(node_uuid) + if not store: + logger.warning(f'Cannot find store for node UUID {node_uuid} to remove symlink.') + return False + entry = store.get_entry(node_uuid) + if not entry: + logger.warning(f'Cannot find log entry for node UUID {node_uuid} to remove symlink.') + return False + + removed = entry.remove_symlink(symlink_path_to_remove) + if removed: + logger.debug( + f"Removed symlink reference '{symlink_path_to_remove.name}' from log entry for node {node_uuid}." + ) + else: + logger.debug( + f"Symlink reference '{symlink_path_to_remove.name}' not found in log entry for node {node_uuid}." + ) + return removed diff --git a/src/aiida/tools/dumping/managers/__init__.py b/src/aiida/tools/dumping/managers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/aiida/tools/dumping/managers/deletion.py b/src/aiida/tools/dumping/managers/deletion.py new file mode 100644 index 0000000000..537b9298a9 --- /dev/null +++ b/src/aiida/tools/dumping/managers/deletion.py @@ -0,0 +1,270 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +from aiida.common.log import AIIDA_LOGGER +from aiida.tools.dumping.utils.helpers import DumpChanges +from aiida.tools.dumping.utils.paths import DumpPaths + +logger = AIIDA_LOGGER.getChild('tools.dumping.managers.deletion') + +if TYPE_CHECKING: + from aiida.tools.dumping.config import DumpConfig + from aiida.tools.dumping.logger import DumpLogger + from aiida.tools.dumping.mapping import GroupNodeMapping + from aiida.tools.dumping.utils.helpers import GroupInfo + + +class DeletionManager: + """Executes deletion of dumped artifacts for entities deleted from the DB.""" + + def __init__( + self, + config: DumpConfig, + dump_paths: DumpPaths, + dump_logger: DumpLogger, + dump_changes: DumpChanges, + stored_mapping: GroupNodeMapping | None, + ): + """Initializes the DeletionManager. 
+ + + :param config: _description_ + :param dump_paths: _description_ + :param dump_logger: _description_ + :param dump_changes: _description_ + :param stored_mapping: _description_ + """ + self.config: DumpConfig = config + self.dump_paths: DumpPaths = dump_paths + self.dump_logger: DumpLogger = dump_logger + self.dump_changes: DumpChanges = dump_changes + self.stored_mapping: GroupNodeMapping | None = stored_mapping + + def _handle_deleted_entities(self) -> bool: + """ + Removes dump artifacts for entities marked as deleted in the changes object. + + Args: + changes: Object containing the detected node and group deletions. + + Returns: + True if any deletion action (directory or log entry) was performed, False otherwise. + """ + node_uuids_to_delete: set[str] = self.dump_changes.nodes.deleted + group_info_to_delete: list[GroupInfo] = self.dump_changes.groups.deleted + + if not node_uuids_to_delete and not group_info_to_delete: + logger.info('No deleted entities identified in changes object.') + return False + + logger.info('Processing deletions based on detected changes...') + something_deleted = False + + # --- Process Node Deletions (Nodes deleted directly from DB) --- + if node_uuids_to_delete: + logger.report(f'Removing artifacts for {len(node_uuids_to_delete)} deleted nodes...') + for node_uuid in node_uuids_to_delete: + if self._delete_node_from_logger_and_disk(node_uuid): + something_deleted = True + else: + logger.info('No deleted nodes to process.') + + # --- Process Group Deletions (Groups deleted from DB) --- + if group_info_to_delete: + logger.report(f'Removing artifacts for {len(group_info_to_delete)} deleted groups...') + # Extract UUIDs from the GroupInfo objects + group_uuids_to_delete = {g.uuid for g in group_info_to_delete} + for group_uuid in group_uuids_to_delete: + # This method now also handles associated node log entries + if self._delete_group_and_associated_node_logs(group_uuid): + something_deleted = True + else: + logger.info('No deleted groups to process.') + + return something_deleted + + def _delete_node_from_logger_and_disk(self, uuid: str) -> bool: + """ + Helper to remove a node's primary dump directory and its entire log entry. + + Returns: + True if the node's log entry was successfully deleted, False otherwise. + """ + store = self.dump_logger.get_store_by_uuid(uuid) + if not store: + # It might have already been deleted if associated with a deleted group below + logger.debug( + f'Log store not found for deleted node UUID {uuid} (might be expected). Cannot remove further.' + ) + return False # Indicate log entry wasn't deleted *by this call* + + entry = store.get_entry(uuid) + if not entry: + logger.warning(f'Log entry not found for node UUID {uuid} in its store. Cannot remove.') + return False # Indicate log entry wasn't deleted + + path_to_delete = entry.path + # Determine store key for deletion from logger + store_key = next( + ( + s_name + for s_name in ['calculations', 'workflows', 'data'] + if getattr(self.dump_logger, s_name, None) == store + ), + None, + ) + if not store_key: + logger.error(f'Consistency error: Could not determine store key name for node {uuid}.') + # Try deleting log entry anyway? Or return False? Let's try deleting. 
+ # Fallback: Attempt deletion without knowing the exact store key (less ideal) + # deleted_from_log = store.del_entry(uuid) # Assumes store has del_entry + # For now, return False as we couldn't guarantee deletion from the main logger structure + return False + + deleted_from_log = False + try: + # Attempt to delete directory first (use appropriate safeguard) + # TODO: Adjust safeguard if Data nodes use a different one + rel_path = path_to_delete.relative_to(self.dump_paths.parent) + msg = f"Deleting directory '{rel_path}' for deleted node UUID {uuid}" + logger.report(msg) + DumpPaths._safe_delete_dir(path=path_to_delete, safeguard_file=DumpPaths.safeguard_file) + except FileNotFoundError as e: + logger.warning( + f'Directory or safeguard file not found for deleted node {uuid} at {path_to_delete}: {e}. ' + f'Proceeding to remove log entry.' + ) + except Exception as e: + logger.error( + f'Failed to delete directory for deleted node {uuid} at {path_to_delete}: {e}. ' + f'Proceeding to remove log entry.', + exc_info=True, + ) + finally: + # Always attempt to remove the log entry + if self.dump_logger.del_entry(store_key=store_key, uuid=uuid): + logger.debug(f"Removed log entry for deleted node {uuid} from store '{store_key}'.") + deleted_from_log = True + else: + # This might happen if it was already removed via group deletion logic + logger.debug( + f'Log entry for deleted node {uuid} potentially already removed ' + f'(e.g., via group deletion). Store: {store_key}.' + ) + + return deleted_from_log + + def _delete_group_and_associated_node_logs(self, group_uuid: str) -> bool: + """ + Removes a group's log entry, its dump directory (if applicable), + and any node log entries whose primary dump path was within that directory. + + Args: + group_uuid: The UUID of the group deleted from the database. + + Returns: + True if the group's log entry was successfully deleted, False otherwise. + """ + group_log_deleted = False + path_deleted: Path | None = None # Keep track of the path we deleted + + # --- 1. Delete Group Directory (if applicable) --- + group_entry = self.dump_logger.groups.get_entry(group_uuid) + if group_entry: + path_to_delete = group_entry.path + should_delete_dir = self.config.organize_by_groups and path_to_delete != self.dump_paths.absolute + if should_delete_dir: + try: + rel_path_str = 'unknown' + try: + rel_path = path_to_delete.relative_to(self.dump_paths.absolute) + rel_path_str = str(rel_path) + except ValueError: + rel_path_str = str(path_to_delete) + + logger.report(f"Deleting directory '{rel_path_str}' for deleted group UUID {group_uuid}") + DumpPaths._safe_delete_dir(path=path_to_delete, safeguard_file=DumpPaths.safeguard_file) + path_deleted = path_to_delete # Record that we deleted this path + except FileNotFoundError: + msg = ( + 'Safeguard check failed or directory not found for deleted group ' + '{group_uuid} at {path_to_delete}: {e}' + ) + logger.warning(msg) + # If directory wasn't found, still potentially record its path for log cleanup + path_deleted = path_to_delete + except Exception as e: + logger.error( + f'Failed to delete directory for deleted group {group_uuid} at {path_to_delete}: {e}', + exc_info=True, + ) + else: + logger.debug(f'Not deleting directory for group {group_uuid} (flat structure or root path).') + else: + logger.warning(f'Log entry not found for deleted group UUID {group_uuid}. Cannot remove directory.') + + # --- 2. 
Delete Group Log Entry --- + if self.dump_logger.del_entry(store_key='groups', uuid=group_uuid): + logger.debug(f'Removed log entry for deleted group {group_uuid}.') + group_log_deleted = True + else: + logger.warning(f'Failed to remove log entry for deleted group {group_uuid} (may have been missing).') + + # --- 3. NEW: Delete Node Log Entries Based on Path --- + nodes_removed_count = 0 + # Only proceed if we identified a group directory path (even if deletion failed) + if path_deleted: + logger.info(f'Scanning node logs for entries within deleted group path: {path_deleted}') + # Iterate through all potential node stores + for store_key in ['calculations', 'workflows', 'data']: + node_store = getattr(self.dump_logger, store_key, None) + if not node_store or not hasattr(node_store, 'entries'): + continue + + # Need to copy keys as we modify the dictionary during iteration + node_uuids_in_store = list(node_store.entries.keys()) + for node_uuid in node_uuids_in_store: + node_log_entry = node_store.get_entry(node_uuid) + if not node_log_entry or not node_log_entry.path: + continue # Skip if entry or path is missing + + try: + # Check if the node's primary logged path is inside the deleted group path + # Use resolve() for robust comparison, handle potential errors + if node_log_entry.path.resolve().is_relative_to(path_deleted.resolve()): + msg = ( + f"Node {node_uuid} path '{node_log_entry.path}' is within deleted " + "group path '{path_deleted}'. Removing log entry." + ) + logger.debug(msg) + if self.dump_logger.del_entry(store_key=store_key, uuid=node_uuid): + nodes_removed_count += 1 + # else: No warning needed if removal fails, might be race condition or prior removal + except (OSError, ValueError): + # Errors can happen if paths don't exist when resolve() is called + msg = ( + f'Could not resolve/compare path for node {node_uuid} ' + '({node_log_entry.path}) relative to {path_deleted}: {e}' + ) + logger.warning(msg) + except Exception as e: + logger.error(f'Unexpected error checking path for node {node_uuid}: {e}', exc_info=True) + + if nodes_removed_count > 0: + msg = ( + f'Removed log entries for {nodes_removed_count} nodes whose dump path was within ' + f"the deleted group directory '{path_deleted.name}'." + ) + logger.report(msg) + else: + msg = ( + f'No group directory path identified for deleted group {group_uuid}. ' + 'Skipping path-based node log cleanup.' + ) + logger.debug(msg) + + # Note: The previous logic using stored_mapping is removed as this path-based approach is more direct. + + return group_log_deleted diff --git a/src/aiida/tools/dumping/managers/process.py b/src/aiida/tools/dumping/managers/process.py new file mode 100644 index 0000000000..6f892339c0 --- /dev/null +++ b/src/aiida/tools/dumping/managers/process.py @@ -0,0 +1,768 @@ +########################################################################## +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Manager that deals with dumping a single ProcessNode.""" + +from __future__ import annotations + +import contextlib +import os +from enum import Enum, auto +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Callable, Optional, Tuple + +import yaml + +from aiida import orm +from aiida.common import LinkType, timezone +from aiida.common.log import AIIDA_LOGGER +from aiida.orm.utils import LinkTriple +from aiida.tools.archive.exceptions import ExportValidationError +from aiida.tools.dumping.config import DumpConfig +from aiida.tools.dumping.logger import DumpLog +from aiida.tools.dumping.utils.helpers import DumpStoreKeys +from aiida.tools.dumping.utils.paths import DumpPaths + +if TYPE_CHECKING: + from aiida.tools.dumping.logger import DumpLogger + from aiida.tools.dumping.utils.helpers import DumpTimes + +__all__ = ('NodeMetadataWriter', 'NodeRepoIoDumper', 'ProcessDumpManager', 'ReadmeGenerator', 'WorkflowWalker') + +logger = AIIDA_LOGGER.getChild('tools.dumping.managers.process') + +# Type hint for the recursive dump function expected by WorkflowWalker +DumpProcessorType = Callable[[orm.ProcessNode, Path], None] + + +# TODO: end-user should specify a few actions what to do when duplicate entities are encountered +# TODO: This should be general `--handle-duplicates` or sthg +class NodeDumpAction(Enum): + """Represents the action determined after checking the log.""" + + SKIP = auto() + SYMLINK = auto() + DUMP_PRIMARY = auto() + DUMP_DUPLICATE = auto() + UPDATE = auto() + ERROR = auto() + + +class ProcessDumpManager: + """Handles the processing and dumping of individual process nodes""" + + def __init__( + self, + config: DumpConfig, + dump_paths: DumpPaths, + dump_logger: DumpLogger, + dump_times: DumpTimes, + ): + self.config: DumpConfig = config + self.dump_paths: DumpPaths = dump_paths + self.dump_logger: DumpLogger = dump_logger + self.dump_times: DumpTimes = dump_times + + # Instantiate helper classes + self.metadata_writer = NodeMetadataWriter(config) + self.repo_io_dumper = NodeRepoIoDumper(config) + # Pass the bound method self._dump_process_recursive_entry for recursion + self.workflow_walker = WorkflowWalker(self._dump_process_recursive_entry) + self.readme_generator = ReadmeGenerator() + + def dump( + self, + process_node: orm.ProcessNode, + group: orm.Group | None = None, + target_path: Path | None = None, + ): + """ + Main entry point to dump a single ProcessNode. + Determines the required action and delegates to execution methods. 
+ """ + if not target_path: + target_path = self._get_node_base_path(node=process_node, group=group) + + if not self._validate_node_for_dump(process_node): + return # Validation failed, logged inside helper + + action, existing_log_entry = self._check_log_and_determine_action( + node=process_node, + target_path=target_path, + ) + + # Delegate execution based on the determined action + try: + if action == NodeDumpAction.SKIP: + self._execute_skip(process_node) + elif action == NodeDumpAction.SYMLINK: + self._execute_symlink(process_node, target_path, existing_log_entry) + elif action == NodeDumpAction.UPDATE: + self._execute_update(process_node, target_path, existing_log_entry) + elif action == NodeDumpAction.DUMP_PRIMARY: + self._execute_dump_primary(process_node, target_path) + elif action == NodeDumpAction.DUMP_DUPLICATE: + self._execute_dump_duplicate(process_node, target_path, existing_log_entry) + elif action == NodeDumpAction.ERROR: + self._execute_error(process_node) + except Exception as e: + # Catch errors during execution phase + logger.error( + f'Unhandled exception during execution of action {action.name} ' + f'for node PK={process_node.pk} at {target_path}: {e}', + exc_info=True, + ) + # Decide on cleanup: only cleanup primary if primary failed + is_primary_attempt = action == NodeDumpAction.DUMP_PRIMARY + self._cleanup_failed_dump(process_node, target_path, is_primary_attempt) + + def _dump_process_recursive_entry( + self, + process_node: orm.ProcessNode, + target_path: Path, + group: orm.Group | None = None, # Accept group context + ): + """Entry point for recursive calls from WorkflowWalker.""" + logger.debug(f'Recursive dump call for child node {process_node.pk} -> {target_path.name}') + # Use the main dump logic for recursive calls as well + self.dump(process_node=process_node, group=group, target_path=target_path) + + def _check_log_and_determine_action( + self, + node: orm.ProcessNode, + target_path: Path, + ) -> Tuple[NodeDumpAction, Optional[DumpLog]]: + """ + Checks the logger and node status to determine the appropriate dump action. + This method should NOT have side effects like creating files/dirs or modifying logs. + """ + store_key = DumpStoreKeys.from_instance(node) + node_store = self.dump_logger.get_store_by_name(store_key) + existing_log_entry = node_store.get_entry(node.uuid) + + if not existing_log_entry: + logger.debug(f'Node {node.pk} not found in log. Action: DUMP_PRIMARY') + return NodeDumpAction.DUMP_PRIMARY, None + + # Node is logged. Check paths. + try: + resolved_target_path = target_path.resolve() + resolved_logged_path = existing_log_entry.path.resolve() + except (OSError, ValueError) as e: + msg = ( + f'Error resolving/comparing paths for node {node.pk} ({target_path} vs {existing_log_entry.path}): {e}.' + 'Action: ERROR.' 
+ ) + logger.error(msg) + return NodeDumpAction.ERROR, existing_log_entry + + # --- Case 1: Target path is the SAME as the primary logged path --- + if resolved_target_path == resolved_logged_path: + # Check if node mtime is newer than logged dump mtime + node_mtime = node.mtime + logged_dir_mtime = existing_log_entry.dir_mtime + + needs_update = False + if logged_dir_mtime is None: + needs_update = True + logger.debug(f'Node {node.pk} needs update: Logged dir_mtime is missing.') + # Ensure comparison is between timezone-aware datetimes + elif node_mtime.astimezone(timezone.utc) > logged_dir_mtime.astimezone(timezone.utc): + needs_update = True + logger.debug( + f'Node {node.pk} needs update: Node mtime {node_mtime} > Logged dir_mtime {logged_dir_mtime}.' + ) + + if needs_update: + logger.debug(f'Node {node.pk} exists at target, needs update. Action: UPDATE.') + return NodeDumpAction.UPDATE, existing_log_entry + else: + logger.debug(f'Node {node.pk} exists at target, up-to-date. Action: SKIP.') + return NodeDumpAction.SKIP, existing_log_entry + + # --- Case 2: Target path is DIFFERENT from primary logged path --- + is_calc_node = isinstance(node, orm.CalculationNode) + try: + is_sub_process = ( + len(node.base.links.get_incoming(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all()) > 0 + ) + except Exception: + is_sub_process = False # Default if link check fails + + # Option A: Symlink Calculation + if self.config.symlink_calcs and is_calc_node: + logger.debug(f'Node {node.pk} is calc, symlinking enabled. Action: SYMLINK.') + return NodeDumpAction.SYMLINK, existing_log_entry + + # Option B: Force Duplicate Calculation Dump (if not symlinking) + elif is_calc_node and is_sub_process and not self.config.only_top_level_calcs: + logger.debug(f'Node {node.pk} is sub-calc, forcing duplicate dump. Action: DUMP_DUPLICATE.') + return NodeDumpAction.DUMP_DUPLICATE, existing_log_entry + + # Option C: Standard Duplicate Dump (e.g., Workflows, non-symlinked Calcs) + else: + logger.debug(f'Node {node.pk} logged elsewhere, standard duplicate case. Action: DUMP_DUPLICATE.') + return NodeDumpAction.DUMP_DUPLICATE, existing_log_entry + + def _execute_skip(self, node: orm.ProcessNode): + """Action: Skip dumping this node.""" + logger.debug(f'Skipping node {node.pk} (already dumped and up-to-date or symlinked).') + # No file operations needed + + def _execute_symlink(self, node: orm.ProcessNode, target_path: Path, existing_log_entry: DumpLog): + """Action: Create a relative symlink to the primary dump location.""" + logger.debug(f'Executing SYMLINK for node {node.pk} at {target_path.name}') + + # Avoid creating symlink if target already exists (idempotency) + if target_path.exists() or target_path.is_symlink(): + logger.warning(f'Target path {target_path.name} already exists. Skipping symlink creation.') + # Ensure log entry reflects this link target even if skipped + if target_path.resolve() not in {p.resolve() for p in existing_log_entry.symlinks if p.exists()}: + try: + existing_log_entry.add_symlink(target_path.resolve()) + except OSError as e: + logger.error(f'Could not resolve path {target_path} to add to symlinks log: {e}') + return # Skip actual symlink creation + + try: + source_path = existing_log_entry.path # Absolute path to the original dump dir + if not source_path.exists(): + logger.error(f'Source path {source_path} for node {node.pk} does not exist. 
Cannot symlink.') + # No cleanup needed here, just log the error + return + + link_location_dir = target_path.parent + link_location_dir.mkdir(parents=True, exist_ok=True) + relative_src_path = os.path.relpath(source_path, start=link_location_dir) + + os.symlink(relative_src_path, target_path, target_is_directory=True) + logger.info(f'Created relative symlink {target_path.name} -> {relative_src_path}') + + # Add this new symlink location to the log entry + existing_log_entry.add_symlink(target_path.resolve()) + + except OSError as e: + logger.error(f'Failed symlink creation for node {node.pk} at {target_path.name}: {e}') + except Exception as e: + logger.error(f'Unexpected error during symlink creation for node {node.pk}: {e}', exc_info=True) + + def _execute_update(self, node: orm.ProcessNode, target_path: Path, existing_log_entry: DumpLog): + """Action: Clean existing directory and perform a full dump.""" + logger.info(f'Executing UPDATE for node {node.pk} at {target_path.name} due to mtime change.') + try: + # 1. Clean existing directory + DumpPaths._safe_delete_dir(target_path, safeguard_file=DumpPaths.safeguard_file) + logger.debug(f'Cleaned existing directory for update: {target_path.name}') + + # 2. Prepare directory again + self._prepare_node_dump_directory(target_path) + + # 3. Dump content + self._dump_node_content(node, target_path) + + # 4. Update stats on the existing log entry using the primary path + self._calculate_and_update_stats(node.pk, existing_log_entry.path, existing_log_entry) + + except Exception as e: + logger.error(f'Failed during UPDATE execution for node {node.pk}: {e}', exc_info=True) + # If update fails, we might be left in a partial state. Avoid cleanup of primary log entry. + + def _execute_dump_primary(self, node: orm.ProcessNode, target_path: Path): + """Action: Perform a full dump as the primary location.""" + logger.debug(f'Executing DUMP_PRIMARY for node {node.pk} at {target_path.name}') + log_entry = None + try: + # 1. Prepare directory + self._prepare_node_dump_directory(target_path) + + # 2. Create new log entry + log_entry = DumpLog(path=target_path.resolve()) + store_key = DumpStoreKeys.from_instance(node) + self.dump_logger.get_store_by_name(store_key).add_entry(node.uuid, log_entry) + logger.debug(f'Created primary log entry for node {node.pk}') + + # 3. Dump content + self._dump_node_content(node, target_path) + + # 4. Calculate and update stats for the new log entry + self._calculate_and_update_stats(node.pk, target_path, log_entry) + + except Exception as e: + logger.error(f'Failed during DUMP_PRIMARY execution for node {node.pk}: {e}', exc_info=True) + # Cleanup directory and log entry if primary dump failed + self._cleanup_failed_dump(node, target_path, True) + + def _execute_dump_duplicate(self, node: orm.ProcessNode, target_path: Path, existing_log_entry: DumpLog): + """Action: Perform a full dump at a secondary location.""" + logger.debug(f'Executing DUMP_DUPLICATE for node {node.pk} at {target_path.name}') + try: + # 1. Prepare directory + self._prepare_node_dump_directory(target_path) + + # 2. Add path to duplicates list in existing log entry + existing_log_entry.add_duplicate(target_path.resolve()) + logger.debug(f'Added duplicate path {target_path.name} to log for node {node.pk}') + + # 3. Dump content + self._dump_node_content(node, target_path) + + # 4. 
Update stats on the *primary* log entry (optional, could skip for duplicates) + # Recalculating stats based on primary path might be redundant here unless content changed + # Let's keep it consistent for now. + self._calculate_and_update_stats(node.pk, existing_log_entry.path, existing_log_entry) + + except Exception as e: + logger.error(f'Failed during DUMP_DUPLICATE execution for node {node.pk}: {e}', exc_info=True) + # If duplicate dump fails, potentially clean the partial duplicate directory? + self._cleanup_failed_dump(node, target_path, False) # is_primary_dump=False + + def _execute_error(self, node: orm.ProcessNode): + """Action: Log error, do nothing else.""" + logger.error(f'Executing ERROR action for node {node.pk}. Aborting dump for this node.') + # No file operations + + # --- Helper methods --- + def _validate_node_for_dump(self, node: orm.ProcessNode) -> bool: + """Checks if the node is valid for dumping (Original Logic).""" + if not node.is_sealed and not self.config.dump_unsealed: + msg = f'Process `{node.pk}` must be sealed before it can be dumped, or `--dump-unsealed` set to True.' + raise ExportValidationError(msg) + return True + + def _prepare_node_dump_directory(self, target_path: Path): + logger.debug(f'Preparing dump directory: {target_path.name}') + try: + # Calling the utility function as in the original code + DumpPaths._prepare_dump_path( + path_to_validate=target_path, + dump_mode=self.config.dump_mode, + # TODO: Original used DumpPaths.safeguard_file + safeguard_file=DumpPaths.safeguard_file, + ) + # Original code touched the safeguard file after preparation + (target_path / DumpPaths.safeguard_file).touch(exist_ok=True) + logger.debug(f'Directory {target_path.name} prepared successfully.') + except Exception as e: + logger.error( + f'Failed preparing target path {target_path.name}: {e}', + exc_info=True, + ) + raise # Re-raise to be caught by the main dump_process try-except + + def _dump_node_content( + self, + node: orm.ProcessNode, + target_path: Path, + ): + """Dumps the actual content (metadata, repo, children) (Original Logic).""" + logger.debug(f'Dumping content for node {node.pk} into {target_path.name}') + + # 1. Write Metadata (Original Logic) + self.metadata_writer._write(node, target_path) + logger.debug(f'Metadata written for node {node.pk}') + + # 2. Ensure top-level safeguard exists (Original Logic) + (target_path / DumpPaths.safeguard_file).touch(exist_ok=True) + + # 3. 
Dump Repo/IO or Recurse Children (Original Logic) + if isinstance(node, orm.CalculationNode): + self.repo_io_dumper._dump_calculation_content(node, target_path) + logger.debug(f'Calculation content dumped for node {node.pk}') + elif isinstance(node, orm.WorkflowNode): + # WorkflowWalker calls _dump_process_recursive_entry + # Pass group context as potentially needed by recursive calls for symlink check + self.workflow_walker._dump_children(node, target_path) # Must ensure walker passes group context + logger.debug(f'Workflow children dumped for node {node.pk}') + + def _calculate_and_update_stats(self, node_pk: int, path_to_stat: Path, log_entry: DumpLog): + """Calculates directory stats and updates the log entry (Original Logic).""" + logger.debug(f'Calculating stats for node {node_pk} directory: {path_to_stat.name}') + try: + # Calling the utility as in original code + dir_mtime, dir_size = DumpPaths._get_directory_stats(path_to_stat) + log_entry.dir_mtime = dir_mtime + log_entry.dir_size = dir_size + logger.debug(f'Updated stats for node {node_pk}: mtime={dir_mtime}, size={dir_size} bytes') + except Exception as e: + # Original code didn't explicitly catch errors here, but added logging is good + logger.warning(f'Could not calculate/update stats for node {node_pk} at {path_to_stat}: {e}') + + def _cleanup_failed_dump( + self, + node: orm.ProcessNode, + target_path: Path, + is_primary_dump: bool, + ): + """Cleans up directory and potentially log entry on failure (Original Logic).""" + logger.warning(f'Attempting cleanup for failed dump of node {node.pk} at {target_path.name}') + try: + # Calling the utility as in original code + DumpPaths._safe_delete_dir(target_path, safeguard_file=DumpPaths.safeguard_file) + logger.info(f'Cleaned up directory {target_path.name} for failed node {node.pk}') + + if is_primary_dump: + store_key = DumpStoreKeys.from_instance(node) + node_store = self.dump_logger.get_store_by_name(store_key) + if node_store.del_entry(node.uuid): + logger.info(f'Removed log entry for failed primary dump of node {node.pk}') + else: + logger.warning(f'Could not find log entry to remove for failed primary dump of node {node.pk}') + + except Exception as cleanup_e: + msg = (f'Failed during cleanup for node {node.pk} at {target_path.name}: {cleanup_e}',) + logger.error(msg, exc_info=True) + + def _get_node_base_path(self, node: orm.Node, group: orm.Group | None) -> Path: + """Determine the correct base directory path for dumping a specific node.""" + + group_path = self.dump_paths.absolute / DumpPaths._get_group_path(group=group) + + if isinstance(node, orm.CalculationNode): + type_subdir = 'calculations' + elif isinstance(node, orm.WorkflowNode): + type_subdir = 'workflows' + elif isinstance(node, orm.Data): + type_subdir = 'data' + else: + type_subdir = 'unknown' + + if self.config.organize_by_groups: + # If organizing by groups, place inside the type subdir within the group path + node_parent_path = group_path / type_subdir + else: + # Flat structure: place inside the type subdir directly under the main dump path + node_parent_path = self.dump_paths.absolute / type_subdir + + node_parent_path.mkdir(parents=True, exist_ok=True) + # Generate the specific node directory name + node_directory_name: Path = self._generate_node_directory_name(node) + final_node_path = node_parent_path / node_directory_name + + logger.debug(f'Determined final path for node {node.pk}: {final_node_path}') + return final_node_path + + @staticmethod + def _generate_node_directory_name(node: 
orm.ProcessNode, append_pk: bool = True) -> Path: + """Generates the directory name for a specific node.""" + # Calling the utility function as in the original code + return DumpPaths._get_default_process_dump_path(node, append_pk=append_pk) + + @staticmethod + def _generate_child_node_label(index: int, link_triple: LinkTriple, append_pk: bool = True) -> str: + """Generate clean directory label for child nodes during recursion (Original Logic).""" + # IMPORTANT: Keeping the exact logic from the originally provided file + node = link_triple.node + link_label = link_triple.link_label + + # Generate directories with naming scheme akin to `verdi process status` + label_list = [f'{index:02d}', link_label] + + try: + process_label = node.process_label + if process_label is not None and process_label != link_label: + label_list += [process_label] + + except AttributeError: + process_type = node.process_type + if process_type is not None and process_type != link_label: + label_list += [process_type] + + if append_pk: + label_list += [str(node.pk)] + + node_label = '-'.join(label_list) + # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove + node_label = node_label.replace('CALL-', '') + # Original code had this replacement + return node_label.replace('None-', '') + + +class NodeMetadataWriter: + """Handles writing the .aiida_node_metadata.yaml file.""" + + def __init__(self, config: DumpConfig): + self.config = config + + def _write( + self, + process_node: orm.ProcessNode, + output_path: Path, + output_filename: str = '.aiida_node_metadata.yaml', + ) -> None: + """Dump the selected ProcessNode properties, attributes, and extras to a YAML file.""" + node_properties = [ + 'label', + 'description', + 'pk', + 'uuid', + 'ctime', + 'mtime', + 'node_type', + 'process_type', + 'is_finished_ok', + ] + user_properties = ('first_name', 'last_name', 'email', 'institution') + computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') + + metadata_dict = {prop: getattr(process_node, prop, None) for prop in node_properties} + node_dict = {'Node data': metadata_dict} + + with contextlib.suppress(AttributeError): + node_dbuser = process_node.user + user_dict = {prop: getattr(node_dbuser, prop, None) for prop in user_properties} + node_dict['User data'] = user_dict + + with contextlib.suppress(AttributeError): + node_dbcomputer = process_node.computer + if node_dbcomputer: # Check if computer is assigned + computer_dict = {prop: getattr(node_dbcomputer, prop, None) for prop in computer_properties} + node_dict['Computer data'] = computer_dict + + if self.config.include_attributes: + node_attributes = process_node.base.attributes.all + if node_attributes: + node_dict['Node attributes'] = node_attributes + + if self.config.include_extras: + node_extras = process_node.base.extras.all + if node_extras: + node_dict['Node extras'] = node_extras + + output_file = output_path / output_filename + try: + with output_file.open('w', encoding='utf-8') as handle: + # Use default_flow_style=None for better readability of nested structures + yaml.dump( + node_dict, + handle, + sort_keys=False, + default_flow_style=None, + indent=2, + ) + except Exception as e: + logger.error(f'Failed to write YAML metadata for node {process_node.pk}: {e}') + + +class NodeRepoIoDumper: + """Handles dumping repository contents and linked I/O Data nodes.""" + + def __init__(self, config: DumpConfig): + self.config = config + + def _dump_calculation_content(self, calculation_node: 
orm.CalculationNode, output_path: Path) -> None: + """Dump repository and I/O file contents for a CalculationNode.""" + io_dump_mapping = self._generate_calculation_io_mapping(flat=self.config.flat) + + # Dump the main repository contents + try: + repo_target = output_path / io_dump_mapping.repository + repo_target.mkdir(parents=True, exist_ok=True) + + calculation_node.base.repository.copy_tree(repo_target) + except Exception as e: + logger.error(f'Failed copying repository for calc {calculation_node.pk}: {e}') + + # Dump the repository contents of `outputs.retrieved` if it exists + if hasattr(calculation_node.outputs, 'retrieved'): + try: + retrieved_target = output_path / io_dump_mapping.retrieved + retrieved_target.mkdir(parents=True, exist_ok=True) + calculation_node.outputs.retrieved.base.repository.copy_tree(retrieved_target) + except Exception as e: + logger.error(f'Failed copying retrieved output for calc {calculation_node.pk}: {e}') + else: + logger.debug(f"No 'retrieved' output node found for calc {calculation_node.pk}.") + + # Dump the node_inputs (linked Data nodes) + if self.config.include_inputs: + try: + input_links = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC).all() + if input_links: + input_path = output_path / io_dump_mapping.inputs + # NOTE: Not needed, done in _dump_calculation_io_files + # input_path.mkdir(parents=True, exist_ok=True) + self._dump_calculation_io_files( + parent_path=input_path, + link_triples=input_links, + ) + except Exception as e: + logger.error(f'Failed dumping inputs for calc {calculation_node.pk}: {e}') + + # Dump the node_outputs (created Data nodes, excluding 'retrieved') + if self.config.include_outputs: + # TODO: Possibly also use here explicit attribute chack rather than relying on try-except + # TODO: Which might execute certain statements, then fail, and leave the result of prev. statements leftover + try: + output_links = calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE).all() + output_links_filtered = [link for link in output_links if link.link_label != 'retrieved'] + # TODO: Check here if other orm types, e.g. 
ArrayData would also be dumped + # TODO: Also check for RemoteData if config option also remote is set + # TODO: I have a PR open on that, but for the old version of the code + has_dumpable_output = any( + isinstance(link.node, (orm.SinglefileData, orm.FolderData)) for link in output_links_filtered + ) + if output_links_filtered and has_dumpable_output: + output_path_target = output_path / io_dump_mapping.outputs + output_path_target.mkdir(parents=True, exist_ok=True) + self._dump_calculation_io_files( + parent_path=output_path_target, + link_triples=output_links_filtered, + ) + except Exception as e: + logger.error(f'Failed dumping outputs for calc {calculation_node.pk}: {e}') + + def _dump_calculation_io_files( + self, + parent_path: Path, + link_triples: list[LinkTriple], + ): + """Helper to dump linked input/output Data nodes.""" + for link_triple in link_triples: + node = link_triple.node + link_label = link_triple.link_label + try: + if not self.config.flat: + relative_parts = link_label.split('__') + linked_node_path = parent_path.joinpath(*relative_parts) + else: + # Dump content directly into parent_path, letting copy_tree handle structure + linked_node_path = parent_path + + if node.base.repository.list_object_names(): + linked_node_path.parent.mkdir(parents=True, exist_ok=True) + node.base.repository.copy_tree(linked_node_path) + except Exception as e: + logger.warning(f'Failed copying IO node {node.pk} (link: {link_label}): {e}') + + @staticmethod + def _generate_calculation_io_mapping(flat: bool = False) -> SimpleNamespace: + """Helper to map internal names to directory names for CalcNode I/O.""" + aiida_entities = ['repository', 'retrieved', 'inputs', 'outputs'] + default_dirs = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] + + if flat: + # Empty string means dump into the parent directory itself + mapping = {entity: '' for entity in aiida_entities} + else: + mapping = dict(zip(aiida_entities, default_dirs)) + + return SimpleNamespace(**mapping) + + +class WorkflowWalker: + """Handles traversing WorkflowNode children and triggering their dump.""" + + def __init__(self, dump_processor: DumpProcessorType): + """ + Initialize the WorkflowWalker. + + :param dump_processor: A callable (like NodeManager.dump_process) that + takes a node and a target path to dump it. 
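For orientation, here is a minimal standalone sketch (not part of the patch) of the directory mapping produced by `_generate_calculation_io_mapping` and of how double-underscore link labels are split into nested paths in `_dump_calculation_io_files`; the `pseudos__Si` link label is invented for illustration.

```python
from pathlib import Path
from types import SimpleNamespace

# Non-flat layout: each AiiDA entity gets a dedicated sub-directory.
nested = SimpleNamespace(repository='inputs', retrieved='outputs', inputs='node_inputs', outputs='node_outputs')
# Flat layout: every entity maps to '', i.e. contents land directly in the node directory.
flat = SimpleNamespace(repository='', retrieved='', inputs='', outputs='')

# In the non-flat case, double-underscore link labels become nested directories.
link_label = 'pseudos__Si'  # hypothetical input link label
print(Path(nested.inputs).joinpath(*link_label.split('__')))  # node_inputs/pseudos/Si
```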
+        """
+        self.dump_processor = dump_processor
+
+    def _dump_children(self, workflow_node: orm.WorkflowNode, output_path: Path) -> None:
+        """Find and recursively dump children of a WorkflowNode."""
+        try:
+            called_links = workflow_node.base.links.get_outgoing(
+                link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)
+            ).all()
+            called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime)
+        except Exception as e:
+            logger.error(f'Failed getting children for workflow {workflow_node.pk}: {e}')
+            return
+
+        for index, link_triple in enumerate(called_links, start=1):
+            child_node = link_triple.node
+            # Use static method from NodeManager to generate label consistently
+            from aiida.tools.dumping.managers.process import ProcessDumpManager
+
+            child_label = ProcessDumpManager._generate_child_node_label(index=index, link_triple=link_triple)
+            child_output_path = output_path / child_label
+            assert isinstance(child_node, orm.ProcessNode)
+            try:
+                # Call the provided dump_processor function for the child
+                self.dump_processor(
+                    child_node,
+                    child_output_path,
+                )
+            except Exception as e:
+                logger.error(
+                    f'Failed dumping child node {child_node.pk} of workflow {workflow_node.pk}: {e}',
+                    exc_info=True,
+                )
+
+
+class ReadmeGenerator:
+    """Handles generating README.md files for process nodes."""
+
+    def _generate(self, process_node: orm.ProcessNode, output_path: Path) -> None:
+        """Generate README.md file in the specified output path."""
+        import textwrap
+
+        from aiida.cmdline.utils.ascii_vis import format_call_graph
+        from aiida.cmdline.utils.common import (
+            get_calcjob_report,
+            get_process_function_report,
+            get_workchain_report,
+        )
+
+        pk = process_node.pk
+        _readme_string = textwrap.dedent(
+            f"""\
+            # AiiDA Process Dump: {process_node.process_label or process_node.process_type} <{pk}>
+
+            This directory contains files related to the AiiDA process node {pk}.
+            - **UUID:** {process_node.uuid}
+            - **Type:** {process_node.node_type}
+
+            Sub-directories (if present) represent called calculations or workflows, ordered by creation time.
+            File/directory structure within a calculation node:
+            - `inputs/`: Contains scheduler submission script (`_aiidasubmit.sh`), stdin file (`aiida.in`), and internal
+              AiiDA info (`.aiida/`).
+            - `outputs/`: Contains files retrieved by the parser (e.g., `aiida.out`, `_scheduler-stdout.txt`,
+              `_scheduler-stderr.txt`).
+            - `node_inputs/`: Contains repositories of input data nodes linked via `INPUT_CALC`.
+            - `node_outputs/`: Contains repositories of output data nodes linked via `CREATE` (excluding `retrieved`).
+            - `.aiida_node_metadata.yaml`: Human-readable metadata, attributes, and extras of this node.
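As a worked illustration of the child-directory naming scheme implemented in `_generate_child_node_label` and used by the `WorkflowWalker` above (index, link label, process label, and pk are all invented):

```python
# Second called child, link label 'CALL', process label 'ArithmeticAddCalculation', pk 4321.
label_list = ['02', 'CALL', 'ArithmeticAddCalculation', '4321']
node_label = '-'.join(label_list)
node_label = node_label.replace('CALL-', '').replace('None-', '')
print(node_label)  # 02-ArithmeticAddCalculation-4321
```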
+ """ + ) + + # Add status, report, show info + try: + _readme_string += ( + f'\n## Process Status (`verdi process status {pk}`)\n\n```\n{format_call_graph(process_node)}\n```\n' + ) + except Exception as e: + logger.debug(f'Could not format call graph for README: {e}') + + try: + if isinstance(process_node, orm.CalcJobNode): + report = get_calcjob_report(process_node) + elif isinstance(process_node, orm.WorkChainNode): + report = get_workchain_report(node=process_node, levelname='REPORT', indent_size=2, max_depth=None) + elif isinstance(process_node, (orm.CalcFunctionNode, orm.WorkFunctionNode)): + report = get_process_function_report(process_node) + else: + report = 'N/A' + _readme_string += f'\n## Process Report (`verdi process report {pk}`)\n\n```\n{report}\n```\n' + except Exception as e: + logger.debug(f'Could not generate process report for README: {e}') + + try: + _readme_string += ( + f'\n## Node Info (`verdi node show {process_node.uuid}`)\n\n```\n{{get_node_info(process_node)}}\n```\n' + ) + except Exception as e: + logger.debug(f'Could not get node info for README: {e}') + + try: + (output_path / 'README.md').write_text(_readme_string, encoding='utf-8') + except Exception as e: + logger.error(f'Failed to write README for node {process_node.pk}: {e}') diff --git a/src/aiida/tools/dumping/managers/profile.py b/src/aiida/tools/dumping/managers/profile.py new file mode 100644 index 0000000000..f250694e3a --- /dev/null +++ b/src/aiida/tools/dumping/managers/profile.py @@ -0,0 +1,740 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Manager that orchestrates dumping an AiiDA profile.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import TYPE_CHECKING, cast + +from aiida import orm +from aiida.common import NotExistent +from aiida.common.log import AIIDA_LOGGER +from aiida.common.progress_reporter import get_progress_reporter, set_progress_bar_tqdm +from aiida.orm import Group, QueryBuilder, WorkflowNode +from aiida.tools.dumping.config import GroupDumpScope, ProfileDumpSelection +from aiida.tools.dumping.detect import DumpChangeDetector +from aiida.tools.dumping.logger import DumpLog, DumpLogger +from aiida.tools.dumping.utils.helpers import DumpChanges, DumpNodeStore +from aiida.tools.dumping.utils.paths import DumpPaths + +logger = AIIDA_LOGGER.getChild('tools.dumping.strategies.profile') + +if TYPE_CHECKING: + from aiida.tools.dumping.config import DumpConfig + from aiida.tools.dumping.logger import DumpLogger + from aiida.tools.dumping.managers.process import ProcessDumpManager + from aiida.tools.dumping.mapping import GroupNodeMapping + from aiida.tools.dumping.utils.helpers import GroupChanges, GroupModificationInfo + + +class ProfileDumpManager: + """Strategy for dumping an entire profile.""" + + def __init__( + self, + config: DumpConfig, + dump_paths: DumpPaths, + dump_logger: DumpLogger, + detector: DumpChangeDetector, + process_manager: ProcessDumpManager, + current_mapping: GroupNodeMapping, + selected_group: orm.Group | None = None, + ) -> None: + self.selected_group = selected_group + self.config: DumpConfig 
= config + self.dump_paths: DumpPaths = dump_paths + self.process_manager: ProcessDumpManager = process_manager + self.detector: DumpChangeDetector = detector + self.current_mapping: GroupNodeMapping = current_mapping + self.dump_logger: DumpLogger = dump_logger + + def _register_group_and_prepare_path(self, group: orm.Group, group_path: Path) -> None: + """Ensure group exists in logger and return its path.""" + + if group.uuid not in self.dump_logger.groups.entries: + total_group_path = self.dump_paths.absolute / group_path + total_group_path.mkdir(exist_ok=True, parents=True) + (total_group_path / self.dump_paths.safeguard_file).touch() + + msg = f"Registering group '{group.label}' ({group.uuid}) in logger." + logger.debug(msg) + + group_store = self.dump_logger.groups + group_store.add_entry( + uuid=group.uuid, + entry=DumpLog(path=total_group_path), + ) + + def _determine_groups_to_process(self) -> list[Group]: + """Determine which groups to process based on config.""" + groups_to_process: list[Group] = [] + if self.config.profile_dump_selection == ProfileDumpSelection.ALL: + logger.info('Dumping all groups as requested by configuration.') + try: + qb_groups = QueryBuilder().append(orm.Group) + groups_to_process = qb_groups.all(flat=True) + except Exception as e: + logger.error(f'Failed to query groups for profile dump: {e}') + groups_to_process = [] + elif self.config.groups: + group_identifiers = self.config.groups + logger.info(f'Dumping specific groups: {group_identifiers}') + if group_identifiers: + if isinstance(group_identifiers[0], orm.Group): + groups_to_process = cast(list[Group], group_identifiers) + else: + try: + groups_to_process = [orm.load_group(identifier=str(gid)) for gid in group_identifiers] + except NotExistent as e: + logger.error(f'Error loading specified group: {e}. Aborting group processing.') + except Exception as e: + logger.error(f'Unexpected error loading groups: {e}. 
Aborting group processing.') + # NOTE: Tests failed bc of this + # groups_to_process = [] # Ensure it's empty on error + else: + logger.warning('Scope set to SPECIFIC but no group identifiers provided.') + + logger.info(f'Will process {len(groups_to_process)} groups found in the profile.') + return groups_to_process + + def _identify_nodes_for_group(self, group: Group, changes: DumpChanges) -> tuple[DumpNodeStore, list[WorkflowNode]]: + """Identify nodes explicitly belonging to a given group.""" + nodes_explicitly_in_group = DumpNodeStore() + group_node_uuids = self.current_mapping.group_to_nodes.get(group.uuid, set()) + workflows_explicitly_in_group: list[WorkflowNode] = [] + + for store_key in ['calculations', 'workflows', 'data']: + globally_detected_nodes = getattr(changes.nodes.new_or_modified, store_key, []) + filtered_nodes = [node for node in globally_detected_nodes if node.uuid in group_node_uuids] + if filtered_nodes: + setattr(nodes_explicitly_in_group, store_key, filtered_nodes) + if store_key == 'workflows': + workflows_explicitly_in_group.extend( + cast( + list[WorkflowNode], + [wf for wf in filtered_nodes if isinstance(wf, orm.WorkflowNode)], + ) + ) + return nodes_explicitly_in_group, workflows_explicitly_in_group + + def _identify_ungrouped_nodes(self, changes: DumpChanges) -> tuple[DumpNodeStore, list[WorkflowNode]]: + """Identify nodes detected globally that do not belong to any group.""" + ungrouped_nodes_store = DumpNodeStore() + ungrouped_workflows: list[WorkflowNode] = [] + + for store_key in ['calculations', 'workflows', 'data']: + store_nodes = getattr(changes.nodes.new_or_modified, store_key, []) + # Node is ungrouped if its UUID is not in the node_to_groups mapping + ungrouped = [node for node in store_nodes if node.uuid not in self.current_mapping.node_to_groups] + if ungrouped: + setattr(ungrouped_nodes_store, store_key, ungrouped) + if store_key == 'workflows': + ungrouped_workflows.extend( + cast( + list[WorkflowNode], + [wf for wf in ungrouped if isinstance(wf, orm.WorkflowNode)], + ) + ) + return ungrouped_nodes_store, ungrouped_workflows + + def _add_ungrouped_descendants_if_needed( + self, + ungrouped_nodes_store: DumpNodeStore, + ungrouped_workflows: list[WorkflowNode], + ) -> None: + """Add calculation descendants for ungrouped workflows if config requires.""" + if self.config.only_top_level_calcs or not ungrouped_workflows: + return + + logger.debug('Finding calculation descendants for ungrouped workflows (only_top_level_calcs=False)') + try: + descendants = DumpChangeDetector._get_calculation_descendants(ungrouped_workflows) + if descendants: + existing_calc_uuids = {calc.uuid for calc in ungrouped_nodes_store.calculations} + logged_calc_uuids = set(self.dump_logger.calculations.entries.keys()) + unique_descendants = [ + desc + for desc in descendants + if desc.uuid not in existing_calc_uuids and desc.uuid not in logged_calc_uuids + ] + if unique_descendants: + logger.debug(f'Adding {len(unique_descendants)} unique, unlogged descendants to ungrouped dump.') + if not hasattr(ungrouped_nodes_store, 'calculations') or ungrouped_nodes_store.calculations is None: + ungrouped_nodes_store.calculations = [] + ungrouped_nodes_store.calculations.extend(unique_descendants) + else: + logger.debug('All descendants for ungrouped workflows were already included or logged.') + except Exception as e: + logger.warning(f'Could not retrieve/process descendants for ungrouped workflows: {e}') + + def _process_ungrouped_nodes(self) -> None: + """Identify ALL currently 
ungrouped nodes (ignoring time filter), + apply necessary filters (like top-level), and ensure they are + represented in the dump if config.also_ungrouped is True and + they don't already have an ungrouped representation. + """ + if not self.config.also_ungrouped: + logger.info('Skipping ungrouped nodes processing (also_ungrouped=False).') + return + + logger.info('Processing ungrouped nodes (also_ungrouped=True)...') + + # 1. Determine the target path for ungrouped nodes + try: + ungrouped_path_relative = DumpPaths._get_group_path( + group=None, organize_by_groups=self.config.organize_by_groups + ) + ungrouped_path_absolute = self.dump_paths.absolute / ungrouped_path_relative + ungrouped_path_absolute.mkdir(exist_ok=True, parents=True) + logger.debug(f'Target path for ungrouped nodes: {ungrouped_path_absolute}') + except Exception as e: + logger.error(f'Failed to determine or create ungrouped path: {e}', exc_info=True) + return + + # 2. Use Node Query logic, ignoring time filter, to get initial candidates + logger.debug('Querying detector for ungrouped nodes with ignore_time_filter=True...') + try: + # Ensure self.detector and self.dump_times are accessible + # Query base Node type to get all potential candidates initially + initial_ungrouped_nodes: list[orm.Node] = self.detector.node_query._get_nodes( + orm_type=orm.Node, + dump_times=self.detector.dump_times, # Need access to dump_times + scope=GroupDumpScope.NO_GROUP, + ignore_time_filter=True, + ) + logger.debug( + f'Query returned {len(initial_ungrouped_nodes)} potential ungrouped nodes (ignoring time filter).' + ) + except AttributeError: + logger.error('Cannot access detector.node_query or detector.dump_times. Refactoring needed.') + return + except Exception as e: + logger.error(f'Query for ungrouped nodes failed: {e}', exc_info=True) + return + + # 3. Convert list to dictionary format required by filter methods + nodes_by_type: dict[str, list[orm.Node]] = { + 'calculations': [], + 'workflows': [], + 'data': [], # Include data if relevant + } + for node in initial_ungrouped_nodes: + try: + # Use isinstance for robust type checking + if isinstance(node, orm.CalculationNode): + nodes_by_type['calculations'].append(node) + elif isinstance(node, orm.WorkflowNode): + nodes_by_type['workflows'].append(node) + # Add elif for orm.Data if needed + # else: logger.debug(f"Node {node.pk} is not Calc/Work, ignoring for now.") + except Exception as e: + logger.warning(f'Error classifying node {node.pk} by type: {e}') + + # 4. Apply the Top-Level Filter (reuse detector's logic) + logger.debug('Applying top-level filter to ungrouped nodes...') + try: + # Pass the dictionary grouped by type to the filter + filtered_ungrouped_nodes_by_type = self.detector._apply_top_level_filter(nodes_by_type) + wf_count = len(filtered_ungrouped_nodes_by_type.get('workflows', [])) + calc_count = len(filtered_ungrouped_nodes_by_type.get('calculations', [])) + logger.debug(f'After top-level filter: {wf_count} workflows, {calc_count} calculations remain.') + except AttributeError: + logger.error('Cannot access detector._apply_top_level_filter. Refactoring needed.') + return + except Exception as e: + logger.error(f'Applying top-level filter failed: {e}', exc_info=True) + return + + nodes_to_dump_ungrouped = DumpNodeStore() + nodes_processed_count = 0 + + # 5. 
Check remaining nodes against logger for existing UNGROUPED representation + # Iterate through the dictionary returned by the filter + for store_key, node_list in filtered_ungrouped_nodes_by_type.items(): + if store_key not in ['calculations', 'workflows', 'data']: # Ensure valid store key + continue + for node in node_list: + node_uuid = node.uuid + log_entry = self.dump_logger.get_store_by_uuid(node_uuid) + + has_ungrouped_representation = False + if log_entry: + try: + primary_path = log_entry.path + # Check if primary_path exists before resolving + if ( + primary_path + and primary_path.exists() + and primary_path.resolve().is_relative_to(ungrouped_path_absolute.resolve()) + ): + has_ungrouped_representation = True + + if not has_ungrouped_representation: + for symlink_path in log_entry.symlinks: + if symlink_path.exists() and symlink_path.resolve().is_relative_to( + ungrouped_path_absolute.resolve() + ): + has_ungrouped_representation = True + break + if not has_ungrouped_representation: + for duplicate_path in log_entry.duplicates: + if duplicate_path.exists() and duplicate_path.resolve().is_relative_to( + ungrouped_path_absolute.resolve() + ): + has_ungrouped_representation = True + break + except (OSError, ValueError, AttributeError) as e: + logger.warning(f'Error resolving/checking paths for logged node {node_uuid}: {e}') + except Exception as e: + logger.error(f'Unexpected error checking paths for logged node {node_uuid}: {e}', exc_info=True) + + # 6. Schedule dump if needed + if not has_ungrouped_representation: + msg = ( + f'Ungrouped node {node_uuid} (passed filters) lacks representation under ' + "'{ungrouped_path_relative}'. Scheduling." + ) + logger.debug(msg) + # Add to the correct list within nodes_to_dump_ungrouped + getattr(nodes_to_dump_ungrouped, store_key).append(node) + + # 7. Dump the collected nodes + if len(nodes_to_dump_ungrouped) > 0: + logger.report(f'Dumping/linking {len(nodes_to_dump_ungrouped)} nodes under ungrouped path...') + try: + (ungrouped_path_absolute / self.dump_paths.safeguard_file).touch(exist_ok=True) + self._dump_nodes(nodes_to_dump_ungrouped, group=None) + nodes_processed_count = len(nodes_to_dump_ungrouped) + except Exception as e: + logger.error(f'Failed processing nodes under ungrouped path: {e}', exc_info=True) + else: + logger.info('No ungrouped nodes required a new representation in the dump after applying filters.') + + logger.info(f'Finished processing ungrouped nodes. Processed {nodes_processed_count} nodes.') + + def _update_group_stats(self) -> None: + """Calculate and update final directory stats for all logged groups.""" + logger.info('Calculating final directory stats for all registered groups...') + for group_uuid, group_log_entry in self.dump_logger.groups.entries.items(): + group_path = group_log_entry.path + if not group_path.is_absolute(): + try: + group_path = self.dump_logger.dump_paths.parent / group_path + logger.debug(f'Resolved relative group path for {group_uuid} to {group_path}') + except Exception as path_e: + logger.error(f'Failed to resolve relative path for group {group_uuid}: {path_e}') + continue + + if not group_path.is_dir(): + logger.warning(f'Group path {group_path} for UUID {group_uuid} is not a directory. 
Skipping stats.') + continue + + logger.debug(f'Calculating stats for group directory: {group_path} (UUID: {group_uuid})') + try: + dir_mtime, dir_size = DumpPaths._get_directory_stats(group_path) + group_log_entry.dir_mtime = dir_mtime + group_log_entry.dir_size = dir_size + logger.debug(f'Updated stats for group {group_uuid}: mtime={dir_mtime}, size={dir_size}') + except Exception as e: + logger.error(f'Failed to calculate/update stats for group {group_uuid} at {group_path}: {e}') + + def _add_and_dump_group_descendants_if_needed( + self, + group: Group, + workflows_in_group: list[WorkflowNode], + ) -> None: + """ + Finds calculation descendants for the group's workflows (if config requires) + and immediately triggers their dump within the group context. + """ + if self.config.only_top_level_calcs or not workflows_in_group: + return # Skip if only dumping top-level or no workflows in group + + msg = ( + 'Finding and dumping calculation descendants for workflows in group' + "'{group.label}' (only_top_level_calcs=False)" + ) + logger.debug(msg) + try: + descendants = DumpChangeDetector._get_calculation_descendants(workflows_in_group) + if not descendants: + logger.debug(f"No calculation descendants found for workflows in group '{group.label}'.") + return + + logged_calc_uuids = set(self.dump_logger.calculations.entries.keys()) + + # Identify descendants that are not already logged (dumped previously) + unique_unlogged_descendants = [desc for desc in descendants if desc.uuid not in logged_calc_uuids] + + if unique_unlogged_descendants: + msg = ( + f'Immediately dumping {len(unique_unlogged_descendants)} unique, ' + "unlogged calculation descendants for group '{group.label}'." + ) + logger.info(msg) + # Create a temporary store just for these descendants + descendant_store = DumpNodeStore(calculations=unique_unlogged_descendants) + # Call node manager to dump these *now* within the group context + # This ensures they are created directly under .../calculations/ + self._dump_nodes(descendant_store, group=group) + # They will now be in the logger before the main workflow dump encounters them nested. + else: + logger.debug(f"All descendants for group '{group.label}' were already logged.") + except Exception as e: + logger.warning( + f"Could not retrieve/process/dump descendants for group '{group.label}': {e}", + exc_info=True, + ) + + def _process_group(self, group: Group, changes: DumpChanges) -> None: + """Process a single group: find nodes, dump descendants, dump explicit nodes.""" + logger.debug(f'Processing group: {group.label} ({group.uuid})') + group_path_relative = DumpPaths._get_group_path(group=group, organize_by_groups=self.config.organize_by_groups) + group_path_absolute = self.dump_paths.absolute / group_path_relative + + try: + self._register_group_and_prepare_path( + group=group, + group_path=group_path_absolute, + ) + except Exception as e: + logger.error(f'Failed to handle group path for {group.label}: {e}') + return + + # 1. Identify nodes explicitly in this group from the global changes + # We mainly care about the workflows here for the descendant logic. + nodes_explicitly_in_group, workflows_in_group = self._identify_nodes_for_group(group, changes) + + # 2. Find and *immediately* dump calculation descendants if required + # This ensures the direct dump under ".../calculations/" happens first. + self._add_and_dump_group_descendants_if_needed(group, workflows_in_group) + + # 3. 
Dump the nodes explicitly identified for this group + # (This will primarily be the workflows now, potentially some top-level calcs) + # Filter out calculations already dumped in step 2 to avoid redundant primary dump calls. + explicit_calcs_in_group = nodes_explicitly_in_group.calculations + explicit_workflows_in_group = nodes_explicitly_in_group.workflows + explicit_data_in_group = nodes_explicitly_in_group.data # Assuming data handling exists + + # Create final store for this step (mainly workflows + maybe top-level calcs/data) + store_for_explicit_dump = DumpNodeStore( + calculations=explicit_calcs_in_group, # Keep explicit calcs if any + workflows=explicit_workflows_in_group, + data=explicit_data_in_group, + ) + + if store_for_explicit_dump and len(store_for_explicit_dump) > 0: + logger.info(f"Dumping {len(store_for_explicit_dump)} explicitly identified nodes for group '{group.label}'") + try: + # Dump these nodes (e.g., the Workflow) in the group context + self._dump_nodes(store_for_explicit_dump, group=group) + # When the workflow dump recurses, it will find descendants already logged + # and trigger DUMP_DUPLICATE or SYMLINK correctly. + except Exception as e: + logger.error( + f"Error dumping explicitly identified nodes for group '{group.label}': {e}", + exc_info=True, + ) + else: + logger.debug( + f"No further explicitly identified nodes to dump in group '{group.label}' after handling descendants." + ) + + def _handle_group_changes(self, group_changes: GroupChanges): + """Handle changes in the group structure since the last dump. + + Apart from membership changes in the group-node mapping, this doesn't do much else actual work, as the actual + dumping and deleting are handled in other places. + + :param group_changes: _description_ + """ + logger.report('Processing group changes...') + + # 1. Handle Deleted Groups (Directory deletion handled by DeletionManager) + # We might still need to log this or perform other cleanup + if group_changes.deleted: + group_labels = [group_info.label for group_info in group_changes.deleted] + msg = f'Detected {len(group_changes.deleted)} deleted groups.' + logger.report(msg) + + # 2. Handle New Groups + if group_changes.new: + group_labels = [group_info.label for group_info in group_changes.new] + logger.report(f'Processing {len(group_changes.new)} new or modified groups: {group_labels}') + for group_info in group_changes.new: + # Ensure the group directory exists and is logged + try: + group = orm.load_group(uuid=group_info.uuid) + # Avoid creating empty directories for empty groups + if not group.nodes: + continue + # Avoid creating empty directories for deselected groups + if self.config.groups and ( + group.label not in self.config.groups or group_info.uuid not in self.config.groups + ): + continue + group_path = DumpPaths._get_group_path( + group=group, organize_by_groups=self.config.organize_by_groups + ) + self._register_group_and_prepare_path( + group=group, + group_path=group_path, + ) + # Dumping nodes within this new group will happen if they + # are picked up by the NodeChanges detection based on the config. 
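The per-group flow above first dumps calculation descendants directly, then the explicitly selected nodes. A rough sketch of the resulting layout, assuming `organize_by_groups=True`; directory names and PKs are illustrative only, since the real paths come from `DumpPaths._get_group_path` and `_generate_node_directory_name`.

```python
# Illustrative layout only; labels and PKs below are invented.
#
# <dump_root>/
# └── <group_label>/
#     ├── calculations/                # descendants dumped first (step 2 of _process_group)
#     │   └── PwCalculation-123/
#     └── workflows/                   # explicitly selected nodes dumped afterwards (step 3);
#         └── PwBandsWorkChain-120/    #   nested calculations are then symlinked or duplicated
```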
+ except Exception as e: + logger.warning(f'Could not process new group {group_info.uuid}: {e}') + + # --- Handle Renamed Groups --- + if self.config.update_groups and group_changes.renamed: + logger.report(f'Processing {len(group_changes.renamed)} renamed groups...') + for rename_info in group_changes.renamed: + old_path = rename_info.old_path + new_path = rename_info.new_path + logger.info(f"Handling rename for group {rename_info.uuid}: '{old_path.name}' -> '{new_path.name}'") + + # 1. Rename directory on disk + if old_path.exists(): + try: + # Ensure parent of new path exists + new_path.parent.mkdir(parents=True, exist_ok=True) + # os.rename is generally preferred for atomic rename if possible + os.rename(old_path, new_path) + logger.debug(f"Renamed directory '{old_path}' to '{new_path}'") + except OSError as e: + logger.error(f'Failed to rename directory for group {rename_info.uuid}: {e}', exc_info=True) + # Decide whether to continue trying to update logger + continue # Skip logger update if rename failed + else: + logger.warning(f"Old path '{old_path}' for renamed group {rename_info.uuid} not found on disk.") + # Still attempt logger update, as the log might be inconsistent + + # 2. Update logger paths + try: + # Call the refined update_paths method + self.dump_logger.update_paths(old_base_path=old_path, new_base_path=new_path) + except Exception as e: + logger.error( + f'Failed to update logger paths for renamed group {rename_info.uuid}: {e}', exc_info=True + ) + + # --- Handle Modified Groups (Membership changes) --- + if group_changes.modified: + group_labels = [group_info.label for group_info in group_changes.modified] + logger.report(f'Processing {len(group_changes.modified)} modified groups (membership): {group_labels}') + for mod_info in group_changes.modified: + # Ensure group path exists (might have been renamed above) + try: + current_group = orm.load_group(uuid=mod_info.uuid) + current_group_path_rel = DumpPaths._get_group_path(current_group, self.config.organize_by_groups) + current_group_path_abs = self.dump_paths.absolute / current_group_path_rel + # Ensure path exists in logger and on disk after potential rename + self._register_group_and_prepare_path(current_group, current_group_path_abs) + # Pass the *current* absolute path to _update_group_membership + self._update_group_membership(mod_info, current_group_path_abs) + except Exception as e: + logger.error(f'Cannot prepare path/update membership for modified group {mod_info.uuid}: {e}') + + def _update_group_membership(self, mod_info: GroupModificationInfo, current_group_path_abs: Path) -> None: + """Update dump structure for a group with added/removed nodes.""" + # (Make sure this method now receives the correct, potentially *new*, absolute group path) + msg = ( + f'Updating group membership {mod_info.label}: {len(mod_info.nodes_added)} added, ' + f'{len(mod_info.nodes_removed)} removed.' + ) + logger.debug(msg) # Changed level to debug as it's less critical than rename itself + + try: + group = orm.load_group(uuid=mod_info.uuid) + except Exception as e: + logger.error(f'Cannot load group {mod_info.uuid} for membership update: {e}') + return + + # Node addition handling remains the same - process manager places it correctly + for node_uuid in mod_info.nodes_added: + try: + node = orm.load_node(uuid=node_uuid) + logger.debug(f"Node {node_uuid} added to group {group.label}. 
Ensuring it's dumped/linked.") + # Process manager dump will handle placing it under the correct *current* group path + self.process_manager.dump(process_node=node, group=group) + except Exception as e: + logger.warning(f'Could not process node {node_uuid} added to group {group.label}: {e}') + + # Node removal handling uses the passed current_group_path_abs + if self.config.organize_by_groups and mod_info.nodes_removed: + logger.debug(f"Handling {len(mod_info.nodes_removed)} nodes removed from group '{group.label}'") + for node_uuid in mod_info.nodes_removed: + # Pass the correct current path for cleanup + self._remove_node_from_group_dir(current_group_path_abs, node_uuid) + + def _remove_node_from_group_dir(self, group_path: Path, node_uuid: str): + """ + Find and remove a node's dump dir/symlink within a specific group path. + Handles nodes potentially deleted from the DB by checking filesystem paths. + """ + node_path_in_logger = self.dump_logger.get_dump_path_by_uuid(node_uuid) + # store = self.dump_logger.get_store_by_uuid(node_uuid) + if not node_path_in_logger: + logger.warning(f'Cannot find logger path for node {node_uuid} to remove from group.') + return + + # Even if node is deleted from DB, we expect the dump_logger to know the original path name + node_filename = node_path_in_logger.name + + # Construct potential paths within the group dir where the node might be represented + # The order matters if duplicates could somehow exist; checks stop on first find. + possible_paths_to_check = [ + group_path / 'calculations' / node_filename, + group_path / 'workflows' / node_filename, + group_path / node_filename, # Check group root last + ] + + found_path: Path | None = None + for potential_path in possible_paths_to_check: + # Use exists() which works for files, dirs, and symlinks (even broken ones) + if potential_path.exists(): + found_path = potential_path + logger.debug(f'Found existing path for node {node_uuid} representation at: {found_path}') + break # Stop searching once a potential candidate is found + + if not found_path: + logger.debug( + f"Node {node_uuid} representation ('{node_filename}') not found in standard " + f"group locations within '{group_path.name}'. No removal needed." + ) + return + + # --- Removal Logic applied to the found_path --- + try: + # Determine if the found path IS the original logged path. + # This is crucial to avoid deleting the source if it was stored directly in the group path. + is_target_dir = False + try: + # Use resolve() for robust comparison, handles symlinks, '.', '..' etc. + # This comparison is only meaningful if the original logged path *still exists*. + # If node_path_in_logger points to a non-existent location, found_path cannot be it. + if node_path_in_logger.exists(): + # Resolving might fail if permissions are wrong, hence the inner try/except + is_target_dir = found_path.resolve() == node_path_in_logger.resolve() + except OSError as e: + # Error resolving paths, cannot be certain it's not the target. Err on safe side. + logger.error( + f'Error resolving path {found_path} or {node_path_in_logger}: {e}. ' + f"Cannot safely determine if it's the target directory. Skipping removal." 
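A minimal sketch of the safety check used when removing a node's representation from a group directory: only symlinks, or directories that are not the primary dump location recorded in the logger, may be deleted. The paths are hypothetical.

```python
from pathlib import Path

primary = Path('/tmp/dump/calculations/PwCalculation-42')             # logged primary dump path
candidate = Path('/tmp/dump/my-group/calculations/PwCalculation-42')  # found inside the group dir

is_target_dir = primary.exists() and candidate.resolve() == primary.resolve()
# The primary dump itself is never deleted by a group-membership change.
may_remove = candidate.is_symlink() or (candidate.is_dir() and not is_target_dir)
print(may_remove)
```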
+ ) + return + + log_suffix = f" from group directory '{group_path.name}'" + + # Proceed with removal based on what found_path is + if found_path.is_symlink(): + logger.info(f"Removing symlink '{found_path.name}'{log_suffix}.") + try: + # Unlink works even if the symlink target doesn't exist + found_path.unlink() + # TODO: Remove symlink from logger + self.dump_logger.remove_symlink_from_log_entry(node_uuid, found_path) + # store.remove_symlink(found_path) + except OSError as e: + logger.error(f'Failed to remove symlink {found_path}: {e}') + + elif found_path.is_dir() and not is_target_dir: + # It's a directory *within* the group structure (likely a copy), and NOT the original. Safe to remove. + logger.info(f"Removing directory '{found_path.name}'{log_suffix}.") + try: + # Ensure safe_delete_dir handles non-empty dirs and potential errors + DumpPaths._safe_delete_dir(found_path, safeguard_file=DumpPaths.safeguard_file) + except Exception as e: + logger.error(f'Failed to safely delete directory {found_path}: {e}') + + elif is_target_dir: + # The path found *is* the primary logged path. + # Removing the node from a group shouldn't delete its primary data here. + logger.debug( + f'Node {node_uuid} representation found at {found_path} is the primary dump path. ' + f'It is intentionally not deleted by this operation.' + ) + else: + # Exists, but isn't a symlink, and isn't a directory that's safe to remove + # (e.g., it's a file, or is_target_dir was True but it wasn't a dir?) + logger.warning( + f'Path {found_path} exists but is not a symlink or a directory designated ' + f'for removal in this context (is_dir={found_path.is_dir()}, is_target_dir={is_target_dir}). ' + 'Skipping removal.' + ) + + except Exception as e: + # Catch unexpected errors during the removal logic + logger.exception( + f'An unexpected error occurred while processing path {found_path} for node {node_uuid}: {e}' + ) + + def _dump_nodes(self, node_store: DumpNodeStore, group: orm.Group | None = None): + """Dump a collection of nodes from a node store within an optional group context. + + :param node_store: _description_ + :param group: _description_, defaults to None + """ + set_progress_bar_tqdm() + nodes_to_dump = [] + nodes_to_dump.extend(node_store.calculations) + nodes_to_dump.extend(node_store.workflows) + if not nodes_to_dump: + return + desc = f'Dumping {len(nodes_to_dump)} nodes' + if group: + desc += f" for group '{group.label}'" + logger.report(desc) + with get_progress_reporter()(desc=desc, total=len(nodes_to_dump)) as progress: + for node in nodes_to_dump: + try: + # Call the main entry point for dumping a single process + self.process_manager.dump(process_node=node, group=group) + except Exception as e: + logger.error( + f'Failed preparing/dumping node PK={node.pk}: {e}', + exc_info=True, + ) + finally: + progress.update() + + def dump(self, changes: DumpChanges) -> None: + """Dumps the entire profile by orchestrating helper methods.""" + logger.info('Executing ProfileDumpManager') + if self.config.profile_dump_selection == ProfileDumpSelection.NONE: + logger.report('Default profile dump scope is NONE, skipping profile content dump.') + return + + # 1. Handle Group Lifecycle (using Group Manager) + # This applies changes detected earlier (new/deleted groups, membership) + # Directory creation/deletion for groups happens here or in DeletionManager + logger.info('Processing group lifecycle and membership changes...') + self._handle_group_changes(changes.groups) + + # 2. 
Determine which groups need node processing based on config + # (e.g., all groups, specific groups) + groups_to_process = self._determine_groups_to_process() + + # 3. Process nodes within each selected group + logger.info('Processing nodes within groups...') + for group in groups_to_process: + # _process_group handles finding nodes for this group, + # adding descendants if needed, and calling node_manager.dump_nodes + self._process_group(group, changes) + + # 4. Process ungrouped nodes if requested by config + # _process_ungrouped_nodes finds relevant nodes and calls node_manager.dump_nodes + self._process_ungrouped_nodes() + + # 5. Update final stats for logged groups after all dumping is done + self._update_group_stats() + + logger.info('Finished ProfileDumpManager.') diff --git a/src/aiida/tools/dumping/mapping.py b/src/aiida/tools/dumping/mapping.py new file mode 100644 index 0000000000..991d17bb30 --- /dev/null +++ b/src/aiida/tools/dumping/mapping.py @@ -0,0 +1,212 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +"""GroupNodeMapping to handle group-node relationships.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, Set + +from aiida import orm +from aiida.tools.dumping.utils.helpers import GroupChanges, GroupInfo, GroupModificationInfo, NodeMembershipChange + + +@dataclass +class GroupNodeMapping: + """Stores the mapping between groups and their member nodes.""" + + # Map of group UUID to set of node UUIDs + group_to_nodes: Dict[str, Set[str]] = field(default_factory=dict) + + # Map of node UUID to set of group UUIDs (for faster lookups) + node_to_groups: Dict[str, Set[str]] = field(default_factory=dict) + + def _add_node_to_group(self, group_uuid: str, node_uuid: str) -> None: + """Add a node to a group in the mapping.""" + # Add to group->nodes mapping + if group_uuid not in self.group_to_nodes: + self.group_to_nodes[group_uuid] = set() + self.group_to_nodes[group_uuid].add(node_uuid) + + # Add to node->groups mapping + if node_uuid not in self.node_to_groups: + self.node_to_groups[node_uuid] = set() + self.node_to_groups[node_uuid].add(group_uuid) + + def _remove_node_from_group(self, group_uuid: str, node_uuid: str) -> None: + """Remove a node from a group in the mapping.""" + # Remove from group->nodes mapping + if group_uuid in self.group_to_nodes and node_uuid in self.group_to_nodes[group_uuid]: + self.group_to_nodes[group_uuid].remove(node_uuid) + # Clean up empty entries + if not self.group_to_nodes[group_uuid]: + del self.group_to_nodes[group_uuid] + + # Remove from node->groups mapping + if node_uuid in self.node_to_groups and group_uuid in self.node_to_groups[node_uuid]: + self.node_to_groups[node_uuid].remove(group_uuid) + # Clean up empty entries + if not self.node_to_groups[node_uuid]: + del self.node_to_groups[node_uuid] + + def _remove_group(self, group_uuid: str) -> None: + """Remove a group and all its node associations.""" + if group_uuid not in self.group_to_nodes: + return + + # Get all nodes in this group + nodes = self.group_to_nodes[group_uuid].copy() + + # Remove group from each 
node's groups + for node_uuid in nodes: + if node_uuid in self.node_to_groups: + if group_uuid in self.node_to_groups[node_uuid]: + self.node_to_groups[node_uuid].remove(group_uuid) + # Clean up empty entries + if not self.node_to_groups[node_uuid]: + del self.node_to_groups[node_uuid] + + # Remove the group entry + del self.group_to_nodes[group_uuid] + + def to_dict(self) -> Dict: + """Convert to serializable dictionary.""" + return { + 'group_to_nodes': {group_uuid: list(node_uuids) for group_uuid, node_uuids in self.group_to_nodes.items()}, + 'node_to_groups': {node_uuid: list(group_uuids) for node_uuid, group_uuids in self.node_to_groups.items()}, + } + + @classmethod + def from_dict(cls, data: Dict) -> 'GroupNodeMapping': + """Create from serialized dictionary.""" + mapping = cls() + + # Handle old format (backward compatibility) + if 'group_to_nodes' in data and isinstance(data['group_to_nodes'], dict): + for group_uuid, node_uuids in data['group_to_nodes'].items(): + for node_uuid in node_uuids: + mapping._add_node_to_group(group_uuid, node_uuid) + + # Handle new format with both mappings + elif isinstance(data, dict): + # Load group_to_nodes mapping if present + if 'group_to_nodes' in data and isinstance(data['group_to_nodes'], dict): + for group_uuid, node_uuids in data['group_to_nodes'].items(): + for node_uuid in node_uuids: + # Just add to the group_to_nodes mapping, we'll rebuild node_to_groups after + if group_uuid not in mapping.group_to_nodes: + mapping.group_to_nodes[group_uuid] = set() + mapping.group_to_nodes[group_uuid].add(node_uuid) + + # Load node_to_groups mapping if present (or rebuild it) + if 'node_to_groups' in data and isinstance(data['node_to_groups'], dict): + for node_uuid, group_uuids in data['node_to_groups'].items(): + for group_uuid in group_uuids: + if node_uuid not in mapping.node_to_groups: + mapping.node_to_groups[node_uuid] = set() + mapping.node_to_groups[node_uuid].add(group_uuid) + else: + # If node_to_groups is missing, rebuild it from group_to_nodes + for group_uuid, node_uuids in mapping.group_to_nodes.items(): + for node_uuid in node_uuids: + if node_uuid not in mapping.node_to_groups: + mapping.node_to_groups[node_uuid] = set() + mapping.node_to_groups[node_uuid].add(group_uuid) + + return mapping + + @classmethod + def build_from_db(cls) -> 'GroupNodeMapping': + """Build a mapping from the current database state.""" + mapping = cls() + + # Query all groups and their nodes + qb = orm.QueryBuilder() + qb.append(orm.Group, tag='group', project=['uuid']) + qb.append(orm.Node, with_group='group', project=['uuid']) + + for group_uuid, node_uuid in qb.all(): + mapping._add_node_to_group(group_uuid, node_uuid) + + return mapping + + def diff(self, other: 'GroupNodeMapping') -> GroupChanges: + """ + Calculate differences between this mapping and another. + + Returns: + GroupChangeInfo object with detailed group changes. 
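A minimal usage sketch of the bookkeeping methods defined above, with placeholder UUIDs and no profile required; note that `diff()` additionally calls `orm.load_group` and therefore does need a loaded profile.

```python
from aiida.tools.dumping.mapping import GroupNodeMapping  # module added by this patch

mapping = GroupNodeMapping()
mapping._add_node_to_group('group-uuid-A', 'node-uuid-1')
mapping._add_node_to_group('group-uuid-A', 'node-uuid-2')
mapping._add_node_to_group('group-uuid-B', 'node-uuid-1')

assert mapping.node_to_groups['node-uuid-1'] == {'group-uuid-A', 'group-uuid-B'}

mapping._remove_node_from_group('group-uuid-B', 'node-uuid-1')
assert 'group-uuid-B' not in mapping.group_to_nodes  # empty groups are pruned

# to_dict()/from_dict() round-trip preserves the group -> nodes view.
assert GroupNodeMapping.from_dict(mapping.to_dict()).group_to_nodes == mapping.group_to_nodes
```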
+ """ + # TODO: Seems like when nodes added to a group, this group is still presented as "new" + deleted_groups_info = [] + new_groups_info = [] + modified_groups_info = [] + node_membership_changes = {} + + self_group_uuids = set(self.group_to_nodes.keys()) + other_group_uuids = set(other.group_to_nodes.keys()) + + # Deleted groups + for group_uuid in self_group_uuids - other_group_uuids: + deleted_groups_info.append( + GroupInfo( + uuid=group_uuid, + node_count=len(self.group_to_nodes.get(group_uuid, set())), + # as group deleted, cannot be loaded from DB + # label=orm.load_group(group_uuid).label + ) + ) + + # New groups + for group_uuid in other_group_uuids - self_group_uuids: + new_groups_info.append( + GroupInfo( + uuid=group_uuid, + node_count=len(other.group_to_nodes.get(group_uuid, set())), + label=orm.load_group(group_uuid).label, + ) + ) + + # Modified groups + for group_uuid in self_group_uuids & other_group_uuids: + self_nodes = self.group_to_nodes.get(group_uuid, set()) + other_nodes = other.group_to_nodes.get(group_uuid, set()) + + added_nodes = list(other_nodes - self_nodes) + removed_nodes = list(self_nodes - other_nodes) + + if added_nodes or removed_nodes: + modified_groups_info.append( + GroupModificationInfo( + uuid=group_uuid, + label=orm.load_group(group_uuid).label, + nodes_added=added_nodes, + nodes_removed=removed_nodes, + ) + ) + + # Track detailed node membership changes + for node_uuid in added_nodes: + if node_uuid not in node_membership_changes: + node_membership_changes[node_uuid] = NodeMembershipChange() + node_membership_changes[node_uuid].added_to.append(group_uuid) + + for node_uuid in removed_nodes: + if node_uuid not in node_membership_changes: + node_membership_changes[node_uuid] = NodeMembershipChange() + node_membership_changes[node_uuid].removed_from.append(group_uuid) + + # Construct and return the GroupChangeInfo object + return GroupChanges( + deleted=deleted_groups_info, + new=new_groups_info, + modified=modified_groups_info, + node_membership=node_membership_changes, + ) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py deleted file mode 100644 index 794b1fcab2..0000000000 --- a/src/aiida/tools/dumping/processes.py +++ /dev/null @@ -1,426 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. 
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Functionality for dumping of ProcessNodes.""" - -from __future__ import annotations - -import logging -from pathlib import Path -from types import SimpleNamespace -from typing import List - -import yaml - -from aiida.common import LinkType -from aiida.common.exceptions import NotExistentAttributeError -from aiida.orm import ( - CalcFunctionNode, - CalcJobNode, - CalculationNode, - LinkManager, - ProcessNode, - WorkChainNode, - WorkflowNode, - WorkFunctionNode, -) -from aiida.orm.utils import LinkTriple -from aiida.tools.archive.exceptions import ExportValidationError -from aiida.tools.dumping.utils import prepare_dump_path - -LOGGER = logging.getLogger(__name__) - - -class ProcessDumper: - def __init__( - self, - include_inputs: bool = True, - include_outputs: bool = False, - include_attributes: bool = True, - include_extras: bool = True, - overwrite: bool = False, - flat: bool = False, - dump_unsealed: bool = False, - incremental: bool = True, - ) -> None: - self.include_inputs = include_inputs - self.include_outputs = include_outputs - self.include_attributes = include_attributes - self.include_extras = include_extras - self.overwrite = overwrite - self.flat = flat - self.dump_unsealed = dump_unsealed - self.incremental = incremental - - @staticmethod - def _generate_default_dump_path(process_node: ProcessNode) -> Path: - """Simple helper function to generate the default parent-dumping directory if none given. - - This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default - parent folder for the dumping, if no name is given. - - :param process_node: The `ProcessNode` for which the directory is created. - :return: The absolute default parent dump path. - """ - - pk = process_node.pk - try: - return Path(f'dump-{process_node.process_label}-{pk}') - except AttributeError: - # This case came up during testing, not sure how relevant it actually is - return Path(f'dump-{process_node.process_type}-{pk}') - - @staticmethod - def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: - """Generate README.md file in main dumping directory. - - :param process_node: `CalculationNode` or `WorkflowNode`. - :param output_path: Output path for dumping. - - """ - - import textwrap - - from aiida.cmdline.utils.ascii_vis import format_call_graph - from aiida.cmdline.utils.common import ( - get_calcjob_report, - get_node_info, - get_process_function_report, - get_workchain_report, - ) - - pk = process_node.pk - - _readme_string = textwrap.dedent( - f"""\ - This directory contains the files involved in the calculation/workflow - `{process_node.process_label} <{pk}>` run with AiiDA. - - Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA - jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their - creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried - by running `verdi process status {pk}` on the command line. 
- - By default, input and output files of each calculation can be found in the corresponding "inputs" and "outputs" - directories (the former also contains the hidden ".aiida" folder with machine-readable job execution settings). - Additional input and output files (depending on the type of calculation) are placed in the "node_inputs" and - "node_outputs", respectively. - - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant - AiiDA node data for further inspection.""" - ) - - # `verdi process status` - process_status = format_call_graph(calc_node=process_node, max_depth=None, call_link_label=True) - _readme_string += f'\n\n\nOutput of `verdi process status {pk}`:\n\n```shell\n{process_status}\n```' - - # `verdi process report` - # Copied over from `cmd_process` - if isinstance(process_node, CalcJobNode): - process_report = get_calcjob_report(process_node) - elif isinstance(process_node, WorkChainNode): - process_report = get_workchain_report(process_node, levelname='REPORT', indent_size=2, max_depth=None) - elif isinstance(process_node, (CalcFunctionNode, WorkFunctionNode)): - process_report = get_process_function_report(process_node) - else: - process_report = f'Nothing to show for node type {process_node.__class__}' - - _readme_string += f'\n\n\nOutput of `verdi process report {pk}`:\n\n```shell\n{process_report}\n```' - - # `verdi process show`? - process_show = get_node_info(node=process_node) - _readme_string += f'\n\n\nOutput of `verdi process show {pk}`:\n\n```shell\n{process_show}\n```' - - (output_path / 'README.md').write_text(_readme_string) - - @staticmethod - def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str: - """Small helper function to generate and clean directory label for child nodes during recursion. - - :param index: Index assigned to step at current level of recursion. - :param link_triple: `LinkTriple` of `ProcessNode` explored during recursion. - :return: Chlild node label during recursion. - """ - node = link_triple.node - link_label = link_triple.link_label - - # Generate directories with naming scheme akin to `verdi process status` - label_list = [f'{index:02d}', link_label] - - try: - process_label = node.process_label - if process_label is not None and process_label != link_label: - label_list += [process_label] - - except AttributeError: - process_type = node.process_type - if process_type is not None and process_type != link_label: - label_list += [process_type] - - node_label = '-'.join(label_list) - # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove - node_label = node_label.replace('CALL-', '') - node_label = node_label.replace('None-', '') - - return node_label - - def dump( - self, - process_node: ProcessNode, - output_path: Path | None, - io_dump_paths: List[str | Path] | None = None, - ) -> Path: - """Dumps all data involved in a `ProcessNode`, including its outgoing links. - - Note that if an outgoing link is a `WorkflowNode`, the function recursively calls itself, while files are - only actually created when a `CalculationNode` is reached. - - :param process_node: The parent `ProcessNode` node to be dumped. - :param output_path: The output path where the directory tree will be created. - :param io_dump_paths: Subdirectories created for each `CalculationNode`. - Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] - :raises: ExportValidationError if the node is not sealed and dump_unsealed is False. 
- """ - - if not process_node.is_sealed and not self.dump_unsealed: - raise ExportValidationError( - f'Process `{process_node.pk} must be sealed before it can be dumped, or `dump_unsealed` set to True.' - ) - - if output_path is None: - output_path = self._generate_default_dump_path(process_node=process_node) - - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) - - if isinstance(process_node, CalculationNode): - self._dump_calculation( - calculation_node=process_node, - output_path=output_path, - io_dump_paths=io_dump_paths, - ) - - elif isinstance(process_node, WorkflowNode): - self._dump_workflow( - workflow_node=process_node, - output_path=output_path, - io_dump_paths=io_dump_paths, - ) - - self._generate_readme(process_node=process_node, output_path=output_path) - - return output_path - - def _dump_workflow( - self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: List[str | Path] | None = None - ) -> None: - """Recursive function to traverse a `WorkflowNode` and dump its `CalculationNode` s. - - :param workflow_node: `WorkflowNode` to be traversed. Will be updated during recursion. - :param output_path: Dumping parent directory. Will be updated during recursion. - :param io_dump_paths: Custom subdirectories for `CalculationNode` s, defaults to None - """ - - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) - self._dump_node_yaml(process_node=workflow_node, output_path=output_path) - - called_links = workflow_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() - called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) - - for index, link_triple in enumerate(called_links, start=1): - child_node = link_triple.node - child_label = self._generate_child_node_label(index=index, link_triple=link_triple) - child_output_path = output_path.resolve() / child_label - - # Recursive function call for `WorkFlowNode` - if isinstance(child_node, WorkflowNode): - self._dump_workflow( - workflow_node=child_node, - output_path=child_output_path, - io_dump_paths=io_dump_paths, - ) - - # Once a `CalculationNode` as child reached, dump it - elif isinstance(child_node, CalculationNode): - self._dump_calculation( - calculation_node=child_node, - output_path=child_output_path, - io_dump_paths=io_dump_paths, - ) - - def _dump_calculation( - self, - calculation_node: CalculationNode, - output_path: Path, - io_dump_paths: List[str | Path] | None = None, - ) -> None: - """Dump the contents of a `CalculationNode` to a specified output path. - - :param calculation_node: The `CalculationNode` to be dumped. - :param output_path: The path where the files will be dumped. - :param io_dump_paths: Subdirectories created for the `CalculationNode`. 
- Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] - """ - - prepare_dump_path(path_to_validate=output_path, overwrite=self.overwrite, incremental=self.incremental) - self._dump_node_yaml(process_node=calculation_node, output_path=output_path) - - io_dump_mapping = self._generate_calculation_io_mapping(io_dump_paths=io_dump_paths) - - # Dump the repository contents of the node - calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_mapping.repository) - - # Dump the repository contents of `outputs.retrieved` - try: - calculation_node.outputs.retrieved.base.repository.copy_tree( - output_path.resolve() / io_dump_mapping.retrieved - ) - except NotExistentAttributeError: - pass - - # Dump the node_inputs - if self.include_inputs: - input_links = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) - self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) - - # Dump the node_outputs apart from `retrieved` - if self.include_outputs: - output_links = list(calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)) - output_links = [output_link for output_link in output_links if output_link.link_label != 'retrieved'] - - self._dump_calculation_io( - parent_path=output_path / io_dump_mapping.outputs, - link_triples=output_links, - ) - - def _dump_calculation_io(self, parent_path: Path, link_triples: LinkManager | List[LinkTriple]): - """Small helper function to dump linked input/output nodes of a `CalculationNode`. - - :param parent_path: Parent directory for dumping the linked node contents. - :param link_triples: List of link triples. - """ - - for link_triple in link_triples: - link_label = link_triple.link_label - - if not self.flat: - linked_node_path = parent_path / Path(*link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - linked_node_path = parent_path - - link_triple.node.base.repository.copy_tree(linked_node_path.resolve()) - - def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | None = None) -> SimpleNamespace: - """Helper function to generate mapping for entities dumped for each `CalculationNode`. - - This is to avoid exposing AiiDA terminology, like `repository` to the user, while keeping track of which - entities should be dumped into which directory, and allowing for alternative directory names. - - :param io_dump_paths: Subdirectories created for the `CalculationNode`. - Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] - :return: SimpleNamespace mapping. - """ - - aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs'] - default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] - empty_calculation_io_dump_paths = [''] * 4 - - if self.flat and io_dump_paths is None: - LOGGER.info( - 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' - ) - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, empty_calculation_io_dump_paths))) - - elif not self.flat and io_dump_paths is None: - LOGGER.info( - 'Flat set to False but no `io_dump_paths` provided. ' - + f'Will use the defaults {default_calculation_io_dump_paths}.' - ) - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths))) - - elif self.flat and io_dump_paths is not None: - LOGGER.info('Flat set to True but `io_dump_paths` provided. 
These will be used, but `inputs` not nested.') - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) - else: - LOGGER.info( - 'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.' - ) - return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) # type: ignore[arg-type] - - def _dump_node_yaml( - self, - process_node: ProcessNode, - output_path: Path, - output_filename: str = '.aiida_node_metadata.yaml', - ) -> None: - """Dump the selected `ProcessNode` properties, attributes, and extras to a YAML file. - - :param process_node: The `ProcessNode` to dump. - :param output_path: The path to the directory where the YAML file will be saved. - :param output_filename: The name of the output YAML file. Defaults to `.aiida_node_metadata.yaml`. - """ - - node_properties = [ - 'label', - 'description', - 'pk', - 'uuid', - 'ctime', - 'mtime', - 'node_type', - 'process_type', - 'is_finished_ok', - ] - - user_properties = ('first_name', 'last_name', 'email', 'institution') - - computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') - - node_dict = {} - metadata_dict = {} - - # Add actual node `@property`s to dictionary - for metadata_property in node_properties: - metadata_dict[metadata_property] = getattr(process_node, metadata_property) - - node_dict['Node data'] = metadata_dict - - # Add user data - try: - node_dbuser = process_node.user - user_dict = {} - for user_property in user_properties: - user_dict[user_property] = getattr(node_dbuser, user_property) - node_dict['User data'] = user_dict - except AttributeError: - pass - - # Add computer data - try: - node_dbcomputer = process_node.computer - computer_dict = {} - for computer_property in computer_properties: - computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) - node_dict['Computer data'] = computer_dict - except AttributeError: - pass - - # Add node attributes - if self.include_attributes: - node_attributes = process_node.base.attributes.all - node_dict['Node attributes'] = node_attributes - - # Add node extras - if self.include_extras: - node_extras = process_node.base.extras.all - if node_extras: - node_dict['Node extras'] = node_extras - - output_file = output_path.resolve() / output_filename - with open(output_file, 'w') as handle: - yaml.dump(node_dict, handle, sort_keys=False) diff --git a/src/aiida/tools/dumping/utils.py b/src/aiida/tools/dumping/utils.py deleted file mode 100644 index a631ac25e5..0000000000 --- a/src/aiida/tools/dumping/utils.py +++ /dev/null @@ -1,75 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. 
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Utility functions for dumping features.""" - -from __future__ import annotations - -import logging -import shutil -from pathlib import Path - -__all__ = ['prepare_dump_path'] - -logger = logging.getLogger(__name__) - - -def prepare_dump_path( - path_to_validate: Path, - overwrite: bool = False, - incremental: bool = True, - safeguard_file: str = '.aiida_node_metadata.yaml', -) -> None: - """Create default dumping directory for a given process node and return it as absolute path. - - :param validate_path: Path to validate for dumping. - :param safeguard_file: Dumping-specific file that indicates that the directory indeed originated from a `verdi ... - dump` command to avoid accidentally deleting wrong directory. - Default: `.aiida_node_metadata.yaml` - :return: The absolute created dump path. - :raises ValueError: If both `overwrite` and `incremental` are set to True. - :raises FileExistsError: If a file or non-empty directory exists at the given path and none of `overwrite` or - `incremental` are enabled. - :raises FileNotFoundError: If no `safeguard_file` is found.""" - - if overwrite and incremental: - raise ValueError('Both overwrite and incremental set to True. Only specify one.') - - if path_to_validate.is_file(): - raise FileExistsError(f'A file at the given path `{path_to_validate}` already exists.') - - # Handle existing directory - if path_to_validate.is_dir(): - is_empty = not any(path_to_validate.iterdir()) - - # Case 1: Non-empty directory and overwrite is False - if not is_empty and not overwrite: - if incremental: - logger.info('Incremental dumping selected. Will keep directory.') - else: - raise FileExistsError( - f'Path `{path_to_validate}` already exists, and neither overwrite nor incremental is enabled.' - ) - - # Case 2: Non-empty directory, overwrite is True - if not is_empty and overwrite: - safeguard_exists = (path_to_validate / safeguard_file).is_file() - - if safeguard_exists: - logger.info(f'Overwriting directory `{path_to_validate}`.') - shutil.rmtree(path_to_validate) - - else: - raise FileNotFoundError( - f'Path `{path_to_validate}` exists without safeguard file ' - f'`{safeguard_file}`. Not removing because path might be a directory not created by AiiDA.' - ) - - # Create directory if it doesn't exist or was removed - path_to_validate.mkdir(exist_ok=True, parents=True) - (path_to_validate / safeguard_file).touch() diff --git a/src/aiida/tools/dumping/utils/__init__.py b/src/aiida/tools/dumping/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/aiida/tools/dumping/utils/helpers.py b/src/aiida/tools/dumping/utils/helpers.py new file mode 100644 index 0000000000..f604b74eb2 --- /dev/null +++ b/src/aiida/tools/dumping/utils/helpers.py @@ -0,0 +1,320 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Set, Type, Union + +from aiida import orm +from aiida.common import timezone + +DumpEntityType = Union[orm.CalculationNode, orm.WorkflowNode, orm.Data] +QbDumpEntityType = Union[Type[orm.CalculationNode], Type[orm.WorkflowNode], Type[orm.Data]] +# StoreNameType = Literal["calculations", "workflows", "groups", "data"] + +# NOTE: Using Literal for type safety. +# NOTE: Though, possibly allow for selection via entry point +StoreNameType = Literal['calculations', 'workflows', 'groups'] + + +if TYPE_CHECKING: + pass + + +@dataclass +class DumpTimes: + """Holds relevant timestamps for a dump operation.""" + + current: datetime = field(default_factory=timezone.now) + last: Optional[datetime] = None + + @classmethod + def from_last_log_time(cls, last_log_time: str | None) -> 'DumpTimes': + """Create DumpTimes initializing `last` from an ISO time string.""" + last = None + if last_log_time: + try: + last = datetime.fromisoformat(last_log_time) + except ValueError: + # Handle potential parsing errors if necessary + pass # Or log a warning + return cls(last=last) + + +@dataclass +class DumpNodeStore: + """Store for nodes to be dumped. + + This class follows a similar structure to DumpLogger, making it easier + to convert between the two. + """ + + calculations: list = field(default_factory=list) + workflows: list = field(default_factory=list) + data: list = field(default_factory=list) + groups: list = field(default_factory=list) + + @property + def stores(self) -> dict: + """Retrieve the current state of the container as a dataclass.""" + return { + DumpStoreKeys.CALCULATIONS.value: self.calculations, + DumpStoreKeys.WORKFLOWS.value: self.workflows, + DumpStoreKeys.DATA.value: self.data, + DumpStoreKeys.GROUPS.value: self.groups, + } + + @property + def should_dump_processes(self) -> bool: + return len(self.calculations) > 0 or len(self.workflows) > 0 + + @property + def should_dump_data(self) -> bool: + return len(self.data) > 0 + + def __len__(self) -> int: + return len(self.calculations) + len(self.workflows) + len(self.data) + len(self.groups) + + def num_processes(self) -> int: + return len(self.calculations) + len(self.workflows) + + def add_nodes(self, nodes: list, node_type: Any | None = None) -> None: + """Add nodes to the appropriate store based on node_type. + + Args: + node_type: The type of nodes to add (can be a class or a string identifier) + nodes: List of nodes to add + """ + if node_type: + attr = DumpStoreKeys.from_class(node_type) + elif len(nodes) > 0: + attr = DumpStoreKeys.from_instance(nodes[0]) + else: + raise ValueError + + store: list = getattr(self, attr) + store.extend(nodes) + + def is_empty(self) -> bool: + return len(self) == 0 + + def get_store_by_name(self, name: StoreNameType) -> list: + """Get the appropriate store based on node_type. 
+
+        Args:
+            name: Name of the store to retrieve; one of the keys of the ``stores`` property.
+
+        Returns:
+            The corresponding store list
+        """
+
+        store_names = list(self.stores.keys())
+        if name not in store_names:
+            msg = f'Wrong key <{name}> selected. Choose one of {store_names}.'
+            raise ValueError(msg)
+
+        # ``stores`` maps each store name to the underlying node list
+        return self.stores[name]
+
+    def get_store_by_type(self, node_type: Any) -> list:
+        """Get the appropriate store based on node_type.
+
+        Args:
+            node_type: The type of nodes (can be a class or a string identifier)
+
+        Returns:
+            The corresponding store list
+        """
+
+        attr = DumpStoreKeys.from_class(node_type)
+        return getattr(self, attr)
+
+
+# --- Supporting Dataclasses for Group Changes ---
+@dataclass
+class GroupInfo:
+    """Information about a group (typically for new or deleted groups)."""
+
+    uuid: str
+    node_count: int = 0
+    label: Optional[str] = None
+
+
+@dataclass
+class GroupModificationInfo:
+    """Information about modifications to an existing group's membership."""
+
+    uuid: str
+    label: str
+    nodes_added: List[str] = field(default_factory=list)
+    nodes_removed: List[str] = field(default_factory=list)
+
+
+@dataclass
+class NodeMembershipChange:
+    """Details how a specific node's group membership changed."""
+
+    added_to: List[str] = field(default_factory=list)
+    removed_from: List[str] = field(default_factory=list)
+
+
+@dataclass
+class GroupRenameInfo:
+    """Information about a group that has been renamed."""
+
+    uuid: str
+    old_path: Path
+    new_path: Path
+    new_label: str | None
+
+
+@dataclass
+class GroupChanges:
+    """Holds all changes related to group lifecycle and membership."""
+
+    deleted: List[GroupInfo] = field(default_factory=list)
+    new: List[GroupInfo] = field(default_factory=list)
+    modified: List[GroupModificationInfo] = field(default_factory=list)
+    renamed: List[GroupRenameInfo] = field(default_factory=list)
+    node_membership: Dict[str, NodeMembershipChange] = field(default_factory=dict)
+
+
+@dataclass
+class NodeChanges:
+    """Holds changes related to individual nodes (Calc, Work, Data)."""
+
+    # Nodes detected as new or modified that require dumping
+    new_or_modified: DumpNodeStore = field(default_factory=DumpNodeStore)
+    # UUIDs of *nodes* detected as deleted from the database
+    # Note: We separate deleted nodes from deleted groups based on Option 1.
+    # If you need deleted group UUIDs elsewhere (like DeletionManager),
+    # they are available in GroupChanges.deleted[...].uuid
+    deleted: Set[str] = field(default_factory=set)
+
+
+# TODO: Also write those to disk??
+@dataclass
+class DumpChanges:
+    """Represents all detected changes for a dump cycle."""
+
+    nodes: NodeChanges = field(default_factory=NodeChanges)
+    groups: GroupChanges = field(default_factory=GroupChanges)
+
+    def to_table(self) -> str:
+        """Generate a tabulated summary of all changes in this dump cycle.
+
+        Returns:
+            str: A formatted string containing tables that summarize the changes.
+ """ + from tabulate import tabulate + + # --- Node Changes Table --- + node_rows = [] + + # Add new or modified nodes by type + new_calcs = len(self.nodes.new_or_modified.calculations) + new_workflows = len(self.nodes.new_or_modified.workflows) + new_data = len(self.nodes.new_or_modified.data) + total_new = new_calcs + new_workflows + new_data + + node_rows.append(['Calculations', new_calcs, 'new/modified']) + node_rows.append(['Workflows', new_workflows, 'new/modified']) + node_rows.append(['Data', new_data, 'new/modified']) + node_rows.append(['Total', total_new, 'new/modified']) + + # Add deleted nodes + node_rows.append(['Nodes', len(self.nodes.deleted), 'deleted']) + + node_table = tabulate( + node_rows, + headers=['Entity Type', 'Count', 'Status'], + tablefmt='simple', + ) + + # --- Group Changes Table --- + group_rows = [] + + # New, modified and deleted groups + group_rows.append(['Groups', len(self.groups.new), 'new']) + group_rows.append(['Groups', len(self.groups.modified), 'modified']) + group_rows.append(['Groups', len(self.groups.deleted), 'deleted']) + + # Count node membership changes + nodes_added_to_groups = sum(len(change.added_to) for change in self.groups.node_membership.values()) + nodes_removed_from_groups = sum(len(change.removed_from) for change in self.groups.node_membership.values()) + + group_rows.append(['Node memberships', nodes_added_to_groups, 'added']) + group_rows.append(['Node memberships', nodes_removed_from_groups, 'removed']) + + group_table = tabulate( + group_rows, + headers=['Entity Type', 'Count', 'Status'], + tablefmt='simple', + ) + + # Combine tables with a header + return ( + 'Anticipated dump changes\n' + '========================\n\n' + 'Nodes:\n' + f'{node_table}\n\n' + 'Groups:\n' + f'{group_table}' + ) + + +class DumpStoreKeys(str, Enum): + CALCULATIONS = 'calculations' + WORKFLOWS = 'workflows' + GROUPS = 'groups' + DATA = 'data' + + @classmethod + def from_instance(cls, node_inst: orm.Node | orm.Group) -> StoreNameType: + if isinstance(node_inst, orm.CalculationNode): + return cls.CALCULATIONS.value + elif isinstance(node_inst, orm.WorkflowNode): + return cls.WORKFLOWS.value + elif isinstance(node_inst, orm.Data): + return cls.DATA.value + elif isinstance(node_inst, orm.Group): + return cls.GROUPS.value + else: + msg = f'Dumping not implemented yet for node type: {type(node_inst)}' + raise NotImplementedError(msg) + + @classmethod + def from_class(cls, orm_class: Type) -> StoreNameType: + if issubclass(orm_class, orm.CalculationNode): + return cls.CALCULATIONS.value + elif issubclass(orm_class, orm.WorkflowNode): + return cls.WORKFLOWS.value + elif issubclass(orm_class, orm.Data): + return cls.DATA.value + elif issubclass(orm_class, orm.Group): + return cls.GROUPS.value + else: + msg = f'Dumping not implemented yet for node type: {orm_class}' + raise NotImplementedError(msg) + + @classmethod + def to_class(cls, key: 'DumpStoreKeys') -> Type: + mapping = { + cls.CALCULATIONS: orm.CalculationNode, + cls.WORKFLOWS: orm.WorkflowNode, + cls.DATA: orm.Data, + cls.GROUPS: orm.Group, + } + if key in mapping: + return mapping[key] + else: + msg = f'No node type mapping exists for key: {key}' + raise ValueError(msg) diff --git a/src/aiida/tools/dumping/utils/paths.py b/src/aiida/tools/dumping/utils/paths.py new file mode 100644 index 0000000000..fba984496a --- /dev/null +++ b/src/aiida/tools/dumping/utils/paths.py @@ -0,0 +1,398 @@ +########################################################################### +# Copyright (c), The AiiDA team. 
All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +"""Path utility functions and classes for the dumping feature.""" + +# NOTE: Could manke many of the functions staticmethods of DumpPaths + +from __future__ import annotations + +import shutil +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path + +from aiida import orm +from aiida.common import timezone +from aiida.common.log import AIIDA_LOGGER +from aiida.manage.configuration import Profile +from aiida.tools.dumping.config import DumpMode + +logger = AIIDA_LOGGER.getChild('tools.dumping.utils.paths') + +__all__ = ('DumpPaths',) + + +@dataclass +class DumpPaths: + parent: Path = field(default_factory=Path.cwd) + child: Path = field(default_factory=lambda: Path('aiida-dump')) + _top_level: Path | None = field(default=None, init=False, repr=False) + + safeguard_file = '.aiida_dump_safeguard' + log_file: str = 'aiida_dump_log.json' + config_file: str = 'aiida_dump_config.yaml' + + def __post_init__(self): + # Set top_level during initialization if not provided + # This should not change after construction, as it should always point at the top-level dump directory + self._top_level = self.parent / self.child + + @property + def top_level(self) -> Path: + """Returns the top level path, guaranteed to be non-None.""" + assert self._top_level is not None + return self._top_level + + @classmethod + def from_path(cls, path: Path): + return cls(parent=path.parent, child=Path(path.name)) + + @property + def absolute(self) -> Path: + """Returns the absolute path by joining parent and child.""" + return self.parent / self.child + + @property + def safeguard_path(self) -> Path: + """Returns the relative path to a safeguard file.""" + return self.absolute / self.safeguard_file + + @property + def log_path(self) -> Path: + """Returns the path of the logger JSON.""" + return self.absolute / self.log_file + + # NOTE: Should this return a new instance? + def extend_paths(self, subdir: str) -> DumpPaths: + """ + Creates a new DumpPaths instance with an additional subdirectory. + + Args: + subdir: The name of the subdirectory to add + + Returns: + A new DumpPaths instance with the updated path structure + """ + return DumpPaths(parent=self.absolute, child=Path(subdir)) + + @property + def config_path(self) -> Path: + """Returns the path of the top-level config YAML.""" + assert self.top_level is not None + return self.top_level / self.config_file + + @staticmethod + def _prepare_dump_path( + path_to_validate: Path, + dump_mode: DumpMode, + safeguard_file: str = safeguard_file, + # top_level_caller: bool = True, + ) -> None: + # TODO: Add an option to clean the path here + """Create default dumping directory for a given process node and return it as absolute path. + + :param validate_path: Path to validate for dumping. + :param safeguard_file: Dumping-specific file that indicates that the directory originated from AiiDA's + dump` command to avoid accidentally deleting wrong directory. + :return: The absolute created dump path. + :raises ValueError: If both `overwrite` and `incremental` are set to True. 
+ :raises FileExistsError: If a file or non-empty directory exists at the given path and none of `overwrite` or + `incremental` are enabled. + :raises FileNotFoundError: If no `safeguard_file` is found.""" + + if path_to_validate.is_file(): + msg = f'A file at the given path `{path_to_validate}` already exists.' + raise FileExistsError(msg) + + if not path_to_validate.is_absolute(): + msg = f'The path to validate must be an absolute path. Got `{path_to_validate}.' + raise ValueError(msg) + + # Handle existing non-empty directory + safeguard_path = path_to_validate / safeguard_file + if path_to_validate.is_dir() and any(path_to_validate.iterdir()): + # Check for safeguard first if directory is not empty + if not safeguard_path.is_file(): + # If non-empty AND safeguard is missing, it's an error for OVERWRITE and INCREMENTAL + if dump_mode in (DumpMode.OVERWRITE, DumpMode.INCREMENTAL): + msg = ( + f'Path `{path_to_validate}` exists and is not empty, but safeguard file ' + f'`{safeguard_file}` is missing. Cannot proceed in {dump_mode.name} mode ' + f'to prevent accidental data modification/loss.' + ) + logger.error(msg) + raise FileNotFoundError(msg) + # For DRY_RUN, we might just log a warning or proceed silently + # depending on desired dry-run feedback. Let's log for now. + elif dump_mode == DumpMode.DRY_RUN: + logger.warning( + f'DRY RUN: Path `{path_to_validate}` exists, is not empty, and safeguard file ' + f'`{safeguard_file}` is missing.' + ) + # Safeguard IS present and directory is non-empty + elif dump_mode == DumpMode.OVERWRITE: + DumpPaths._safe_delete_dir( + path=path_to_validate, + safeguard_file=safeguard_file, + ) + + # Check if path is symlink, otherwise `mkdir` fails + if path_to_validate.is_symlink(): + return + + # Finally, (re-)create directory + # Both shutil.rmtree and `_delete_dir_recursively` delete the original dir + # If it already existed, e.g. in the `incremental` case, exist_ok=True + path_to_validate.mkdir(exist_ok=True, parents=True) + path_to_safeguard_file = path_to_validate / safeguard_file + if not path_to_safeguard_file.is_file(): + path_to_safeguard_file.touch() + + @staticmethod + def _safe_delete_dir( + path: Path, + safeguard_file: str = safeguard_file, + ) -> None: + """Safely delete a directory and its contents if it contains the safeguard file. + Uses shutil.rmtree for robust deletion. Also deletes the top-level directory itself. + """ + if not path.exists(): + logger.debug(f"Path '{path}' does not exist, nothing to delete.") + return + + # If it's not a directory (e.g., a file or a symlink), handle differently + if not path.is_dir(): + if path.is_symlink(): + logger.debug(f"Path '{path}' is a symlink, unlinking.") + path.unlink() # missing_ok=True requires Python 3.8+ + else: + logger.debug(f"Path '{path}' is a file, unlinking.") + path.unlink() # missing_ok=True requires Python 3.8+ + return + + # Check if directory is empty *before* safeguard check + is_empty = not any(path.iterdir()) + if is_empty: + logger.debug(f"Path '{path}' is an empty directory, removing.") + path.rmdir() + return + + # Check for safeguard file existence + safeguard_path = path / safeguard_file + if not safeguard_path.is_file(): + msg = ( + f'Path `{path}` exists and is not empty, but safeguard file `{safeguard_file}` is missing. ' + f'Not removing directory to prevent accidental data loss.' 
+            )
+            logger.error(msg)  # Log as error as this is a safety stop
+            raise FileNotFoundError(msg)  # Raise exception to signal failure
+
+        # Safeguard exists, proceed with deletion using shutil.rmtree
+        logger.debug(f"Safeguard file found in '{path}'. Proceeding with recursive deletion.")
+        try:
+            shutil.rmtree(path)
+            logger.debug(f"Successfully deleted directory tree: '{path}'")
+        except OSError as e:
+            # Catch potential errors during rmtree (e.g., permissions)
+            logger.error(
+                f"Error deleting directory tree '{path}' using shutil.rmtree: {e}",
+                exc_info=True,
+            )
+            raise  # Re-raise the error after logging
+
+    @staticmethod
+    def _get_default_process_dump_path(
+        process_node: orm.ProcessNode, prefix: str | None = None, append_pk: bool = True
+    ) -> Path:
+        """Simple helper function to generate the default parent-dumping directory if none given.
+
+        This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default
+        parent folder for the dumping, if no name is given.
+
+        :param process_node: The `ProcessNode` for which the directory is created.
+        :return: The default parent dump path for the process node.
+        """
+
+        path_entities = []
+
+        if prefix is not None:
+            path_entities += [prefix]
+
+        if process_node.label:
+            path_entities.append(process_node.label)
+        elif process_node.process_label is not None:
+            path_entities.append(process_node.process_label)
+        elif process_node.process_type is not None:
+            path_entities.append(process_node.process_type)
+
+        if append_pk:
+            path_entities += [str(process_node.pk)]
+        return Path('-'.join(path_entities))
+
+    @staticmethod
+    def _get_default_profile_dump_path(profile: Profile, prefix: str = 'profile', appendix: str = 'dump') -> Path:
+        """Generate the default dump directory name for a profile.
+
+        :param profile: The profile for which the dump directory name is generated.
+        :param prefix: Prefix for the directory name, defaults to "profile".
+        :param appendix: Appendix for the directory name, defaults to "dump".
+        :return: The default dump path for the profile.
+        """
+        return Path(f'{prefix}-{profile.name}-{appendix}')
+
+    @staticmethod
+    def _get_default_group_dump_path(group: orm.Group | None, prefix: str = 'group', appendix: str = 'dump') -> Path:
+        """Generate the default dump directory name for a group, avoiding duplicated 'group'/'dump' tokens."""
+        if not group:
+            label_elements = ['ungrouped']
+
+        elif 'group' in group.label:
+            if appendix == 'group' and prefix != 'group':
+                label_elements = [prefix, group.label]
+            elif prefix == 'group' and appendix != 'group':
+                label_elements = [group.label, appendix]
+            elif prefix == 'group' and appendix == 'group':
+                label_elements = [group.label]
+            else:
+                label_elements = [prefix, group.label, appendix]
+
+        elif 'dump' in group.label:
+            if appendix == 'dump' and prefix != 'dump':
+                label_elements = [prefix, group.label]
+            elif prefix == 'dump' and appendix != 'dump':
+                label_elements = [group.label, appendix]
+            elif prefix == 'dump' and appendix == 'dump':
+                label_elements = [group.label]
+            else:
+                label_elements = [prefix, group.label, appendix]
+
+        else:
+            label_elements = [prefix, group.label, appendix]
+
+        return Path('-'.join(label_elements))
+
+    # NOTE: Not ideal using None for entity that points to profile (default)
+    @staticmethod
+    def _resolve_click_path_for_dump(
+        path: Path | None | str, entity: Profile | orm.ProcessNode | orm.Group | None = None
+    ) -> DumpPaths:
+        if not isinstance(entity, (orm.ProcessNode, orm.Group, Profile)) and entity is not None:
+            supported_types = 'ProcessNode, Group, Profile'
+            msg = f"Unsupported entity type '{type(entity).__name__}'. Supported types: {supported_types}."
+            raise ValueError(msg)
+
+        if path:
+            path = Path(path)
+            if path.is_absolute():
+                dump_sub_path = Path(path.name)
+                dump_parent_path = path.parent
+            else:
+                dump_sub_path = path
+                dump_parent_path = Path.cwd()
+        else:
+            # Use direct isinstance checks to determine which generator to use
+            if isinstance(entity, Profile):
+                dump_sub_path = DumpPaths._get_default_profile_dump_path(entity)
+            elif isinstance(entity, orm.Group):
+                dump_sub_path = DumpPaths._get_default_group_dump_path(entity)
+            elif isinstance(entity, orm.ProcessNode):
+                dump_sub_path = DumpPaths._get_default_process_dump_path(entity)
+            elif entity is None:
+                # No entity given: fall back to the currently loaded profile (assumes one is loaded)
+                from aiida.manage.configuration import get_profile
+
+                dump_sub_path = DumpPaths._get_default_profile_dump_path(get_profile())
+
+            dump_parent_path = Path.cwd()
+
+        return DumpPaths(
+            parent=dump_parent_path,
+            child=dump_sub_path,
+        )
+
+    @staticmethod
+    def _get_group_path(group: orm.Group | None, organize_by_groups: bool = True) -> Path:
+        """Calculate and return the dump path for a specific group.
+
+        :param group: The group to compute the dump sub-path for, or ``None`` for ungrouped nodes.
+        :param organize_by_groups: If True, organize dumped nodes in per-group subdirectories, defaults to True.
+        :return: The relative dump path for the given group.
+        """
+        if organize_by_groups:
+            if group:
+                # Calculate the subpath based on the group's entry point
+                group_entry_point = group.entry_point
+                if group_entry_point is None:
+                    group_subpath = Path(group.label)
+                else:
+                    group_entry_point_name = group_entry_point.name
+                    if group_entry_point_name == 'core':
+                        group_subpath = Path(f'{group.label}')
+                    elif group_entry_point_name == 'core.import':
+                        group_subpath = Path('import') / f'{group.label}'
+                    else:
+                        group_subpath = Path(*group_entry_point_name.split('.')) / f'{group.label}'
+
+                # Hierarchical structure under 'groups/' using entry point/label
+                group_path = Path('groups') / group_subpath
+            else:
+                group_path = Path('ungrouped')
+        else:
+            # Flat structure - return the main dump path
+            group_path = Path('.')
+
+        return group_path
+
+    @staticmethod
+    def _get_directory_stats(path: Path) -> tuple[datetime | None, int | None]:
+        """
+        Calculate the total size and last modification time of a directory's contents.
+
+        Args:
+            path: The directory path.
+
+        Returns:
+            A tuple containing:
+            - datetime | None: The most recent modification time among all files/dirs,
+              made timezone-aware (UTC assumed if naive).
+            - int | None: The total size in bytes of all files within the directory.
+              Returns None if the path doesn't exist or isn't a directory.
+ """ + total_size = 0 + latest_mtime_ts = 0.0 + + try: + if not path.is_dir(): + logger.debug(f'Path {path} is not a directory, cannot calculate stats.') + return None, None + + # Get mtime of the directory itself initially + latest_mtime_ts = path.stat().st_mtime + + # Iterate through all files and directories recursively + for entry in path.rglob('*'): + try: + stat_info = entry.stat() + if entry.is_file(): + total_size += stat_info.st_size + # Update latest mtime if this entry is newer + latest_mtime_ts = max(stat_info.st_mtime, latest_mtime_ts) + except (OSError, FileNotFoundError) as stat_err: + # Ignore errors for files/dirs that might disappear during iteration + logger.debug(f'Could not stat entry {entry}: {stat_err}') + + # Convert the latest timestamp to a timezone-aware datetime object + latest_mtime_dt = datetime.fromtimestamp(latest_mtime_ts) + latest_mtime_aware = timezone.make_aware(latest_mtime_dt) # Assumes local time if naive + + return latest_mtime_aware, total_size + + except (FileNotFoundError, PermissionError) as e: + logger.error(f'Could not access path {path} to calculate stats: {e}') + return None, None + except Exception as e: + logger.error(f'Unexpected error calculating stats for {path}: {e}', exc_info=True) + return None, None diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index 6c6709e544..021d49933e 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -629,36 +629,36 @@ def test_report(self, run_cli_command): assert len(result.output_lines) == 1, result.output_lines assert result.output_lines[0] == 'No log messages recorded for this entry' - def test_process_dump(self, run_cli_command, tmp_path, generate_workchain_multiply_add): - """Test verdi process dump""" - - # Only test CLI interface here, the actual functionalities of the Python API are tested in `test_processes.py` - test_path = tmp_path / 'cli-dump' - node = generate_workchain_multiply_add() - - # Giving a single identifier should print a non empty string message - options = [str(node.pk), '-p', str(test_path)] - result = run_cli_command(cmd_process.process_dump, options) - assert result.exception is None, result.output - assert 'Success:' in result.output - - # Trying to run the dumping again in the same path but with overwrite=False should raise exception - options = [str(node.pk), '-p', str(test_path), '--no-incremental'] - result = run_cli_command(cmd_process.process_dump, options, raises=True) - assert result.exit_code is ExitCode.CRITICAL - - # Works fine when using overwrite=True - options = [str(node.pk), '-p', str(test_path), '-o', '--no-incremental'] - result = run_cli_command(cmd_process.process_dump, options) - assert result.exception is None, result.output - assert 'Success:' in result.output - - # Set overwrite=True but provide bad directory, i.e. 
missing metadata file - (test_path / '.aiida_node_metadata.yaml').unlink() - - options = [str(node.pk), '-p', str(test_path), '-o'] - result = run_cli_command(cmd_process.process_dump, options, raises=True) - assert result.exit_code is ExitCode.CRITICAL + # def test_process_dump(self, run_cli_command, tmp_path, generate_workchain_multiply_add): + # """Test verdi process dump""" + + # # Only test CLI interface here, the actual functionalities of the Python API are tested in `test_processes.py` + # test_path = tmp_path / 'cli-dump' + # node = generate_workchain_multiply_add() + + # # Giving a single identifier should print a non empty string message + # options = [str(node.pk), '-p', str(test_path)] + # result = run_cli_command(cmd_process.process_dump, options) + # assert result.exception is None, result.output + # assert 'Success:' in result.output + + # # Trying to run the dumping again in the same path but with overwrite=False should raise exception + # options = [str(node.pk), '-p', str(test_path), '--no-incremental'] + # result = run_cli_command(cmd_process.process_dump, options, raises=True) + # assert result.exit_code is ExitCode.CRITICAL + + # # Works fine when using overwrite=True + # options = [str(node.pk), '-p', str(test_path), '-o', '--no-incremental'] + # result = run_cli_command(cmd_process.process_dump, options) + # assert result.exception is None, result.output + # assert 'Success:' in result.output + + # # Set overwrite=True but provide bad directory, i.e. missing metadata file + # (test_path / '.aiida_node_metadata.yaml').unlink() + + # options = [str(node.pk), '-p', str(test_path), '-o'] + # result = run_cli_command(cmd_process.process_dump, options, raises=True) + # assert result.exit_code is ExitCode.CRITICAL @pytest.mark.usefixtures('aiida_profile_clean') diff --git a/tests/conftest.py b/tests/conftest.py index 7ec3c94b72..c90bf8693f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,7 +28,7 @@ import click import pytest -from aiida import get_profile +from aiida import get_profile, orm from aiida.common.folders import Folder from aiida.common.links import LinkType from aiida.manage.configuration import Profile, get_config, load_profile @@ -854,6 +854,144 @@ def _generate_calculation_node_add(): return _generate_calculation_node_add +@pytest.fixture(scope='class') +def construct_calculation_node_add(tmp_path_factory): + def _construct_calculation_node_add(x: int = 1, y: int = 2): + import json + import textwrap + + from aiida.common import LinkType + from aiida.orm import CalcJobNode, Computer, FolderData, InstalledCode, Int + + # Create a minimal computer + # Not using any of the `aiida_localhost` or `aiida_computer_local` fixtures as they are function-scoped + created, computer = Computer.collection.get_or_create( + label='mock_computer', hostname='localhost', transport_type='core.local', scheduler_type='core.direct' + ) + if created: + computer.store() + + # Create the calculation node + calc_node = CalcJobNode(computer=computer) + + # Create input nodes + x_node = Int(x) + y_node = Int(y) + code_node = InstalledCode(computer=computer, filepath_executable='/bin/bash') + + # Store input nodes + x_node.store() + y_node.store() + code_node.store() + + # Input files + input_content = f'echo $(({x} + {y}))\n' + calc_node.base.repository.put_object_from_bytes(input_content.encode(), 'aiida.in') + + # .aiida folder content + calcinfo_dict = { + 'codes_info': [{'stdin_name': 'aiida.in', 'stdout_name': 'aiida.out', 'code_uuid': code_node.uuid}], + 
'retrieve_list': ['aiida.out', '_scheduler-stdout.txt', '_scheduler-stderr.txt'], + 'uuid': calc_node.uuid, + 'file_copy_operation_order': [2, 0, 1], + } + + job_tmpl_dict = { + 'submit_as_hold': False, + 'rerunnable': False, + 'job_name': 'aiida-42', + 'sched_output_path': '_scheduler-stdout.txt', + 'shebang': '#!/bin/bash', + 'sched_error_path': '_scheduler-stderr.txt', + 'sched_join_files': False, + 'prepend_text': '', + 'append_text': '', + 'job_resource': { + 'num_machines': 1, + 'num_mpiprocs_per_machine': 1, + 'num_cores_per_machine': None, + 'num_cores_per_mpiproc': None, + 'tot_num_mpiprocs': 1, + }, + 'codes_info': [ + { + 'prepend_cmdline_params': [], + 'cmdline_params': ['/usr/bin/bash'], + 'use_double_quotes': [False, False], + 'wrap_cmdline_params': False, + 'stdin_name': 'aiida.in', + 'stdout_name': 'aiida.out', + 'stderr_name': None, + 'join_files': False, + } + ], + 'codes_run_mode': 0, + 'import_sys_environment': True, + 'job_environment': {}, + 'environment_variables_double_quotes': False, + 'max_memory_kb': None, + 'max_wallclock_seconds': 3600, + } + + calc_node.base.repository.put_object_from_bytes( + json.dumps(calcinfo_dict, indent=4).encode(), '.aiida/calcinfo.json' + ) + calc_node.base.repository.put_object_from_bytes( + json.dumps(job_tmpl_dict, indent=4).encode(), '.aiida/job_tmpl.json' + ) + + # Submit script + submit_script = textwrap.dedent("""\ + #!/bin/bash + exec > _scheduler-stdout.txt + exec 2> _scheduler-stderr.txt + + '/usr/bin/bash' < 'aiida.in' > 'aiida.out' + """) + + calc_node.base.repository.put_object_from_bytes(submit_script.encode(), '_aiidasubmit.sh') + + # Store CalcInfo in node attributes + calc_node.base.attributes.set('input_filename', 'aiida.in') + calc_node.base.attributes.set('output_filename', 'aiida.out') + + # Add input links + calc_node.base.links.add_incoming(x_node, link_type=LinkType.INPUT_CALC, link_label='x') + calc_node.base.links.add_incoming(y_node, link_type=LinkType.INPUT_CALC, link_label='y') + calc_node.base.links.add_incoming(code_node, link_type=LinkType.INPUT_CALC, link_label='code') + + # Must store CalcjobNode before I can add output files + calc_node.store() + + # Create FolderData node for retrieved + retrieved_folder = FolderData() + output_content = f'{x+y}\n'.encode() + retrieved_folder.put_object_from_bytes(output_content, 'aiida.out') + + scheduler_stdout = '\n'.encode() + scheduler_stderr = '\n'.encode() + retrieved_folder.base.repository.put_object_from_bytes(scheduler_stdout, '_scheduler-stdout.txt') + retrieved_folder.base.repository.put_object_from_bytes(scheduler_stderr, '_scheduler-stderr.txt') + retrieved_folder.store() + + retrieved_folder.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='retrieved') + + # Create and link output node (sum) + output_node = Int(x + y) + output_node.store() + output_node.base.links.add_incoming(calc_node, link_type=LinkType.CREATE, link_label='sum') + + # Set process properties + calc_node.set_process_state('finished') + calc_node.set_process_label('ArithmeticAddCalculation') + calc_node.set_process_type('aiida.calculations:core.arithmetic.add') + calc_node.set_exit_status(0) + + return calc_node + + return _construct_calculation_node_add + + @pytest.fixture def generate_workchain_multiply_add(aiida_localhost): def _generate_workchain_multiply_add(): @@ -953,3 +1091,132 @@ def cat_path() -> Path: run_process = subprocess.run(['which', 'cat'], capture_output=True, check=True) path = run_process.stdout.decode('utf-8').strip() return Path(path) + + 
+@pytest.fixture +def generate_calculation_node_io(generate_calculation_node, tmp_path): + def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): + import io + + import numpy as np + + from aiida.orm import ArrayData, FolderData, SinglefileData + + filename = 'file.txt' + filecontent = 'a' + singlefiledata_linklabel = 'singlefile' + folderdata_linklabel = 'folderdata' + folderdata_relpath = Path('relative_path') + arraydata_linklabel = 'arraydata' + + singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) + # ? Use instance for folderdata + folderdata = FolderData() + folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=str(folderdata_relpath / filename)) # type: ignore[arg-type] + arraydata_input = ArrayData(arrays=np.ones(3)) + + # Create calculation inputs, outputs + calculation_node_inputs = { + singlefiledata_linklabel: singlefiledata_input, + folderdata_linklabel: folderdata, + arraydata_linklabel: arraydata_input, + } + + singlefiledata_output = singlefiledata_input.clone() + folderdata_output = folderdata.clone() + + if attach_outputs: + calculation_outputs = { + folderdata_linklabel: folderdata_output, + singlefiledata_linklabel: singlefiledata_output, + } + else: + calculation_outputs = None + + # Actually write repository file and then read it in when generating calculation_node + (tmp_path / filename).write_text(filecontent) + + calculation_node = generate_calculation_node( + repository=tmp_path, + inputs=calculation_node_inputs, + outputs=calculation_outputs, + entry_point=entry_point, + ) + return calculation_node + + return _generate_calculation_node_io + + +@pytest.fixture +def generate_workchain_node_io(): + def _generate_workchain_node_io(cj_nodes, store_all: bool = True, seal_all: bool = True): + """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `Calculation` with file io.""" + from aiida.orm import WorkflowNode + + wc_node = WorkflowNode() + wc_node_sub = WorkflowNode() + + # Add sub-workchain that calls a calculation + wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workflow') + for cj_node in cj_nodes: + cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') + + # Set process_state so that tests don't throw exception for build_call_graph of README generation + [cj_node.set_process_state('finished') for cj_node in cj_nodes] + wc_node.set_process_state('finished') + wc_node_sub.set_process_state('finished') + + # Need to store/seal (?) so that outputs are being dumped + if seal_all: + wc_node.seal() + wc_node_sub.seal() + [cj_node.seal() for cj_node in cj_nodes] + [node.seal() for node in wc_node.called_descendants] + + if store_all: + wc_node.store() + wc_node_sub.store() + [cj_node.store() for cj_node in cj_nodes] + [node.store() for node in wc_node.called_descendants] + + return wc_node + + return _generate_workchain_node_io + + +@pytest.fixture() +def setup_no_process_group() -> orm.Group: + no_process_group, _ = orm.Group.collection.get_or_create(label='no-process-group') + if no_process_group.is_empty: + int_node = orm.Int(1).store() + no_process_group.add_nodes([int_node]) + return no_process_group + + +# TODO: Add possibility to parametrize with number of nodes created (make factory?) 
+@pytest.fixture() +def setup_add_group(generate_calculation_node_add) -> orm.Group: + add_group, _ = orm.Group.collection.get_or_create(label='add-group') + if add_group.is_empty: + add_node = generate_calculation_node_add() + add_group.add_nodes([add_node]) + return add_group + + +@pytest.fixture() +def setup_multiply_add_group(generate_workchain_multiply_add) -> orm.Group: + multiply_add_group, _ = orm.Group.collection.get_or_create(label='multiply-add-group') + if multiply_add_group.is_empty: + multiply_add_node = generate_workchain_multiply_add() + multiply_add_group.add_nodes([multiply_add_node]) + return multiply_add_group + + +@pytest.fixture() +def setup_duplicate_group(): + def _setup_duplicate_group(source_group: orm.Group, dest_group_label: str): + dupl_group, created = orm.Group.collection.get_or_create(label=dest_group_label) + dupl_group.add_nodes(list(source_group.nodes)) + return dupl_group + + return _setup_duplicate_group diff --git a/tests/tools/dumping/__init__.py b/tests/tools/dumping/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/tools/dumping/conftest.py b/tests/tools/dumping/conftest.py new file mode 100644 index 0000000000..dea35558af --- /dev/null +++ b/tests/tools/dumping/conftest.py @@ -0,0 +1,30 @@ +from pathlib import Path + +import pytest + +from aiida.tools.dumping.config import DumpConfig +from aiida.tools.dumping.logger import DumpLogger, DumpLogStoreCollection +from aiida.tools.dumping.managers.process import ProcessDumpManager +from aiida.tools.dumping.utils.helpers import DumpTimes +from aiida.tools.dumping.utils.paths import DumpPaths + + +@pytest.fixture +def mock_dump_logger(tmp_path): + """Fixture providing a DumpLogger instance without loading from file.""" + # Use a dummy path for the logger, actual file interaction is bypassed + dump_paths = DumpPaths(parent=tmp_path, child=Path('mock_dump')) + stores = DumpLogStoreCollection() + return DumpLogger(dump_paths=dump_paths, stores=stores, last_dump_time_str=None) + + +@pytest.fixture +def process_dump_manager(mock_dump_logger, tmp_path): + """Fixture providing an initialized ProcessDumpManager.""" + config = DumpConfig() # Default config + dump_paths = DumpPaths(parent=tmp_path, child=Path('manager_test_dump')) + dump_times = DumpTimes() + manager = ProcessDumpManager( + config=config, dump_paths=dump_paths, dump_logger=mock_dump_logger, dump_times=dump_times + ) + return manager diff --git a/tests/tools/dumping/managers/test_process_dump_manager.py b/tests/tools/dumping/managers/test_process_dump_manager.py new file mode 100644 index 0000000000..705b88b086 --- /dev/null +++ b/tests/tools/dumping/managers/test_process_dump_manager.py @@ -0,0 +1,192 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +from pathlib import Path + +import pytest +import yaml + +from aiida import orm +from aiida.common.links import LinkType +from aiida.tools.dumping.config import DumpConfig +from aiida.tools.dumping.managers.process import NodeRepoIoDumper, WorkflowWalker + + +@pytest.mark.usefixtures('aiida_profile_clean') +class TestProcessDumpManager: + """Tests the ProcessDumpManager logic.""" + + def test_dump_calculation_content(self, process_dump_manager, generate_calculation_node_io, tmp_path): + """Test the internal _dump_calculation_content method.""" + node = generate_calculation_node_io(attach_outputs=True) + node.seal() + dump_target_path = tmp_path / f'calc_dump_{node.pk}' + dump_target_path.mkdir() # Manager expects path to exist + + # Enable outputs for this test + process_dump_manager.config.include_outputs = True + process_dump_manager.repo_io_dumper._dump_calculation_content(node, dump_target_path) + + # Verify structure (similar to facade test, but focused on manager's output) + assert (dump_target_path / 'inputs' / 'file.txt').is_file() + assert (dump_target_path / 'node_inputs' / 'singlefile' / 'file.txt').is_file() + assert (dump_target_path / 'node_outputs' / 'singlefile' / 'file.txt').is_file() + # Check content + assert (dump_target_path / 'inputs' / 'file.txt').read_text() == 'a' + + def test_dump_calculation_no_inputs(self, process_dump_manager, generate_calculation_node_io, tmp_path): + """Test dumping calculation without inputs.""" + node = generate_calculation_node_io(attach_outputs=True) + node.seal() + dump_target_path = tmp_path / f'calc_dump_no_inputs_{node.pk}' + dump_target_path.mkdir() + + process_dump_manager.config.include_inputs = False + process_dump_manager.config.include_outputs = True # Keep outputs + process_dump_manager.repo_io_dumper._dump_calculation_content(node, dump_target_path) + + assert not (dump_target_path / 'node_inputs').exists() + assert (dump_target_path / 'node_outputs').exists() + assert (dump_target_path / 'inputs' / 'file.txt').is_file() # Repo 'inputs' always dumped + + def test_dump_calculation_no_outputs(self, process_dump_manager, generate_calculation_node_io, tmp_path): + """Test dumping calculation without outputs.""" + node = generate_calculation_node_io(attach_outputs=True) + node.seal() + dump_target_path = tmp_path / f'calc_dump_no_outputs_{node.pk}' + dump_target_path.mkdir() + + process_dump_manager.config.include_inputs = True + process_dump_manager.config.include_outputs = False + process_dump_manager.repo_io_dumper._dump_calculation_content(node, dump_target_path) + + assert (dump_target_path / 'node_inputs').exists() + assert not (dump_target_path / 'node_outputs').exists() + assert not (dump_target_path / 'outputs').exists() # 'retrieved' repo also skipped + + def test_generate_child_node_label(self, process_dump_manager, generate_workchain_multiply_add): + """Test the child node label generation.""" + wc_node = generate_workchain_multiply_add() + # Get outgoing links (assuming fixture creates them) + called_links = wc_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() + called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) + + labels = [ + 
process_dump_manager._generate_child_node_label(idx, triple) + for idx, triple in enumerate(called_links, start=1) + ] + # Example PKs will differ, check the structure + assert labels[0].startswith('01-multiply-') + assert labels[1].startswith('02-ArithmeticAddCalculation-') + # Note: The fixture might create a 'result' node which isn't CALL_CALC/WORK + # Adjust expectation based on actual fixture links + + def test_write_node_yaml(self, process_dump_manager, generate_calculation_node_add, tmp_path): + """Test writing the node metadata YAML.""" + node = generate_calculation_node_add() + dump_target_path = tmp_path / f'yaml_test_{node.pk}' + dump_target_path.mkdir() + + process_dump_manager.metadata_writer._write(node, dump_target_path) + yaml_path = dump_target_path / '.aiida_node_metadata.yaml' + assert yaml_path.is_file() + + with open(yaml_path) as f: + data = yaml.safe_load(f) + + assert 'Node data' in data + assert 'User data' in data + assert 'Computer data' in data + assert 'Node attributes' in data + assert 'Node extras' not in data + + # Test without attributes/extras + process_dump_manager.config.include_attributes = False + process_dump_manager.config.include_extras = False + yaml_path.unlink() + process_dump_manager.metadata_writer._write(node, dump_target_path) + with open(yaml_path) as f: + data_no_attr = yaml.safe_load(f) + + assert 'Node attributes' not in data_no_attr + assert 'Node extras' not in data_no_attr + + def test_generate_readme(self, process_dump_manager, generate_workchain_multiply_add, tmp_path): + """Test README generation.""" + node = generate_workchain_multiply_add() + dump_target_path = tmp_path / f'readme_test_{node.pk}' + dump_target_path.mkdir() + + process_dump_manager.readme_generator._generate(node, dump_target_path) + readme_path = dump_target_path / 'README.md' + assert readme_path.is_file() + content = readme_path.read_text() + + assert f'AiiDA Process Dump: {node.process_label} <{node.pk}>' in content + assert 'Process Status' in content + assert 'Process Report' in content + # assert 'Node Info' in content # Removed from manager for now + assert 'ArithmeticAddCalculation' in content # Check for child node name + + +# === Tests for classes used by ProcessDumpManager === + + +@pytest.mark.usefixtures('aiida_profile_clean') +class TestProcessManagerHelpers: + """Tests helper classes used by ProcessDumpManager.""" + + def test_node_repo_io_dumper_mapping(self): + """Test the IO mapping generation.""" + dumper_normal = NodeRepoIoDumper(DumpConfig(flat=False)) + mapping_normal = dumper_normal._generate_calculation_io_mapping(flat=False) + assert mapping_normal.repository == 'inputs' + assert mapping_normal.retrieved == 'outputs' + assert mapping_normal.inputs == 'node_inputs' + assert mapping_normal.outputs == 'node_outputs' + + dumper_flat = NodeRepoIoDumper(DumpConfig(flat=True)) + mapping_flat = dumper_flat._generate_calculation_io_mapping(flat=True) + assert mapping_flat.repository == '' + assert mapping_flat.retrieved == '' + assert mapping_flat.inputs == '' + assert mapping_flat.outputs == '' + + def test_workflow_walker(self, generate_workchain_multiply_add, tmp_path): + """Test the WorkflowWalker traversal.""" + wc_node = generate_workchain_multiply_add() + dump_target_path = tmp_path / f'walker_test_{wc_node.pk}' + dump_target_path.mkdir() + + dumped_children = {} # Store {uuid: path} of dumped children + + def mock_dump_processor(node: orm.ProcessNode, path: Path): + """Mock function to record which children are processed.""" + 
dumped_children[node.uuid] = path + + walker = WorkflowWalker(dump_processor=mock_dump_processor) + walker._dump_children(wc_node, dump_target_path) + + # Check that children were processed + called_links = wc_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() + child_uuids = {link.node.uuid for link in called_links} + + assert set(dumped_children.keys()) == child_uuids + + # Check paths look correct (e.g., contain the child label) + multiply_node = called_links[0].node + multiply_path = dumped_children[multiply_node.uuid] + assert multiply_path.parent == dump_target_path + assert multiply_path.name.startswith('01-multiply-') + + # Example for the second child (add) + add_node = called_links[1].node + add_path = dumped_children[add_node.uuid] + assert add_path.parent == dump_target_path + assert add_path.name.startswith('02-ArithmeticAddCalculation-') diff --git a/tests/tools/dumping/test_facades.py b/tests/tools/dumping/test_facades.py new file mode 100644 index 0000000000..4ca82a9fa0 --- /dev/null +++ b/tests/tools/dumping/test_facades.py @@ -0,0 +1,1286 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for the dumping of profile data to disk.""" + +import logging +import time +from typing import Any, Dict, List, Optional, Tuple + +import pytest + +from aiida import orm +from aiida.common.log import AIIDA_LOGGER +from aiida.tools.dumping import GroupDumper, ProcessDumper, ProfileDumper +from aiida.tools.dumping.config import DumpConfig, DumpMode, ProfileDumpSelection +from aiida.tools.dumping.utils.paths import DumpPaths + +from .utils import compare_tree + +# TODO: Also verify the log updates +# TODO: Verify Computer/code selection + +# NOTE: There exists `create_file_hierarchy` and `serialize_file_hierarchy` fixtures + +logger = AIIDA_LOGGER.getChild('tools.dumping.tests') + +profile_dump_label = 'profile-dump' +add_group_label = 'add-group' +multiply_add_group_label = 'multiply-add-group' +add_group_dump_label = f'{add_group_label}-dump' +multiply_add_group_dump_label = f'{multiply_add_group_label}-dump' +sub_calc_group_label = 'sub-calc-group' +sub_calc_group_dump_label = f'{sub_calc_group_label}-dump' + +# --- Content Definitions for Dumped Nodes --- + +_ADD_CALC_INPUT_CONTENT = [ + '_aiidasubmit.sh', + 'aiida.in', + {'.aiida': ['calcinfo.json', 'job_tmpl.json']}, +] + +_ADD_CALC_OUTPUT_CONTENT = [ + '_scheduler-stderr.txt', + '_scheduler-stdout.txt', + 'aiida.out', +] + +# Content for a simple calculation node like ArithmeticAddCalculation +_ADD_CALC_NODE_CONTENT = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + {'inputs': _ADD_CALC_INPUT_CONTENT}, + {'outputs': _ADD_CALC_OUTPUT_CONTENT}, +] + +# Content for a simple function node like 'multiply' +_MULTIPLY_FUNC_NODE_CONTENT = [ + '.aiida_node_metadata.yaml', + '.aiida_dump_safeguard', + {'inputs': ['source_file']}, # Assuming multiply function only has this repo file +] + +# --- Content Definitions for IO Calc Nodes --- +_IO_CALC_INPUT_REPO_CONTENT = ['file.txt'] +_IO_CALC_INPUT_NODE_CONTENT = [ + {'arraydata': ['default.npy']}, + {'folderdata': 
[{'relative_path': ['file.txt']}]}, # Represents FolderData repository + {'singlefile': ['file.txt']}, +] +_IO_CALC_OUTPUT_NODE_CONTENT = [ + {'folderdata': [{'relative_path': ['file.txt']}]}, + {'singlefile': ['file.txt']}, +] + +# Content list for a standard nested dump of the IO Calc +_IO_CALC_NODE_CONTENT_NESTED = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + {'inputs': _IO_CALC_INPUT_REPO_CONTENT}, + {'node_inputs': _IO_CALC_INPUT_NODE_CONTENT}, + {'node_outputs': _IO_CALC_OUTPUT_NODE_CONTENT}, +] + +_IO_CALC_NODE_CONTENT_NESTED_NO_OUTPUTS = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + {'inputs': _IO_CALC_INPUT_REPO_CONTENT}, + {'node_inputs': _IO_CALC_INPUT_NODE_CONTENT}, + # No 'node_outputs' key here +] + +# Content list for a flat dump of the IO Calc +# NOTE: Assumes flat dump copies files directly. Exact structure might vary. +_IO_CALC_NODE_CONTENT_FLAT = [ + 'README.md', # Standard file added by ProcessDumper + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + # Files from repo/nodes flattened + 'file.txt', + # From inputs repo, node_inputs/singlefile, node_inputs/folderdata, + # node_outputs/singlefile, node_outputs/folderdata (potentially overwritten if names clash) + 'default.npy', # From node_inputs/arraydata + # Note: Representing folders in flat structure is ambiguous in this dict format. + # The original tree_dump_calculation_io_flat seemed incomplete/potentially incorrect. + # This flat representation might need adjustment based on actual dumper behavior. + # For robustness, testing flat dumps might be better done via content verification (Strategy 2). +] + + +# --- Dynamic Node Tree Generation Helpers --- +def get_expected_io_calc_tree(pk: int, process_label: str = 'CalculationNodeWithIO') -> Dict[str, List[Any]]: + """Generates the expected nested dump tree dict for the IO CalculationNode.""" + node_dir_name = f'{process_label}-{pk}' + return {node_dir_name: _IO_CALC_NODE_CONTENT_NESTED} + + +def get_expected_io_calc_tree_flat(pk: int, process_label: str = 'CalculationNodeWithIO') -> Dict[str, List[Any]]: + """Generates the expected flat dump tree dict for the IO CalculationNode.""" + node_dir_name = f'{process_label}-{pk}' + return {node_dir_name: _IO_CALC_NODE_CONTENT_FLAT} + + +def get_expected_add_calc_tree(pk: int) -> Dict[str, List[Any]]: + """Generates the expected dump tree dict for an ArithmeticAddCalculation.""" + node_dir_name = f'ArithmeticAddCalculation-{pk}' + return {node_dir_name: _ADD_CALC_NODE_CONTENT} + + +def get_expected_multiply_func_tree(pk: int) -> Dict[str, List[Any]]: + """Generates the expected dump tree dict for a 'multiply' function node.""" + node_dir_name = f'multiply-{pk}' # Assuming 'multiply' is the consistent label part + return {node_dir_name: _MULTIPLY_FUNC_NODE_CONTENT} + + +def get_expected_multiply_add_wc_tree(wc_pk: int, child_pks: Tuple[int, int]) -> Dict[str, List[Any]]: + """Generates the expected dump tree dict for a MultiplyAddWorkChain.""" + wc_process_label = 'MultiplyAddWorkChain' + node_dir_name = f'{wc_process_label}-{wc_pk}' + multiply_pk, add_pk = child_pks + + # Get the tree structures for children using their helpers + multiply_child_tree = get_expected_multiply_func_tree(multiply_pk) + add_child_tree = get_expected_add_calc_tree(add_pk) + + # Extract keys to use in parent structure (assumes '01-' and '02-' prefixes) + multiply_dir_key = next(iter(multiply_child_tree.keys())) + add_dir_key = 
next(iter(add_child_tree.keys())) + + return { + node_dir_name: [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + {f'01-{multiply_dir_key}': multiply_child_tree[multiply_dir_key]}, + {f'02-{add_dir_key}': add_child_tree[add_dir_key]}, + ] + } + + +# --- Helper for the WorkChain with IO children --- +def get_expected_io_wc_tree( + wc_pk: int, + child_pks: Tuple[int, int], # Expecting PKs of the two IO Calcs + wc_process_label: str = 'WorkChainNodeWithIO', # Assumed label for the test WC + child_process_label: str = 'CalculationNodeWithIO', # Assumed label for the IO calcs +) -> Dict[str, List[Any]]: + """ + Generates the expected dump tree for the test WorkChain with IO children. + Assumes two children called in sequence. + """ + wc_node_dir_name = f'{wc_process_label}-{wc_pk}' + + # Get the tree structures for the children using their specific helper + # Note: We use the *content* list (_IO_CALC_NODE_CONTENT_NESTED) directly + # to avoid creating intermediate single-node dicts here. + child1_dir_name = f'{child_process_label}-{child_pks[0]}' + child2_dir_name = f'{child_process_label}-{child_pks[1]}' + + wc_content = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + # Nest child 1 (assuming 01- prefix) + { + f'01-{child1_dir_name}': _IO_CALC_NODE_CONTENT_NESTED # Use the predefined content list + }, + # Nest child 2 (assuming 02- prefix) + { + f'02-{child2_dir_name}': _IO_CALC_NODE_CONTENT_NESTED # Use the predefined content list + }, + ] + + return {wc_node_dir_name: wc_content} + + +# --- Helper for the specific nested WorkChain with IO children --- +def get_expected_nested_io_wc_tree( + wc_pk: int, + wc_sub_pk: int, + child_calc_pks: Tuple[int, int], # Expecting PKs of the two IO Calcs called by sub-WC + wc_process_label: str = 'WorkflowNode', # Default from fixture + # Labels below are NOT used for nested directory names, only PKs are. + wc_sub_process_label: str = 'WorkflowNode', # Default from fixture + child_process_label: str = 'CalculationNodeWithIO', # Assumed label for the IO calcs + # Assume standard link labels used by the fixture + wc_to_sub_link_label: str = 'sub_workflow', + sub_to_calc_link_label: str = 'calculation', +) -> Dict[str, List[Any]]: + """ + Generates the expected dump tree for the test nested WorkChain with IO children. + Assumes wc_node calls wc_node_sub, which calls the two calculation nodes. + Uses numerical prefixes based on observed dumper behavior. + Uses content definition appropriate for whether outputs were attached. + Uses correct nested directory naming convention: {prefix}-{link_label}-{child_pk}. 
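+
+    For illustration only (hypothetical PKs 10, 11, 12, 13 and the default labels/link labels), the
+    returned structure resembles:
+
+        {'WorkflowNode-10': ['.aiida_dump_safeguard', '.aiida_node_metadata.yaml',
+            {'01-sub_workflow-11': ['.aiida_dump_safeguard', '.aiida_node_metadata.yaml',
+                {'01-calculation-12': [...]}, {'02-calculation-13': [...]}]}]}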
+ """ + # Top-level directory name uses label and PK + wc_node_dir_name = f'{wc_process_label}-{wc_pk}' + + # Content for children remains the same + child1_content = _IO_CALC_NODE_CONTENT_NESTED_NO_OUTPUTS + child2_content = _IO_CALC_NODE_CONTENT_NESTED_NO_OUTPUTS + + # Build the sub-workflow's content, nesting the calculations + # Keys now use: {prefix}-{link_label}-{CHILD_PK} + # *** ADJUSTED KEYS *** + wc_sub_content = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + # Assuming calculations are called in sequence (01-, 02-) + {f'01-{sub_to_calc_link_label}-{child_calc_pks[0]}': child1_content}, # Nested child 1 dir key uses PK + {f'02-{sub_to_calc_link_label}-{child_calc_pks[1]}': child2_content}, # Nested child 2 dir key uses PK + ] + + # Build the main workflow's content, nesting the sub-workflow + # Key now uses: {prefix}-{link_label}-{CHILD_PK} + wc_content = [ + '.aiida_dump_safeguard', + '.aiida_node_metadata.yaml', + # Assuming sub-workflow is the first thing called (01-) + {f'01-{wc_to_sub_link_label}-{wc_sub_pk}': wc_sub_content}, # Nested sub-workflow dir key uses PK + ] + + return {wc_node_dir_name: wc_content} + + +# --- Dynamic Archive Assembly Helpers --- +def _assemble_nodes_by_type(node_trees: List[Dict]) -> Dict[str, List[Dict]]: + """Helper to group node tree dicts by type.""" + grouped_by_type: Dict[str, List[Dict]] = {'calculations': [], 'workflows': [], 'misc': []} + for node_tree in node_trees: + node_key = next(iter((node_tree.keys()))) + + if 'Calculation' in node_key or 'multiply' in node_key: + grouped_by_type['calculations'].append(node_tree) + elif 'WorkChain' in node_key: # Keywords for workflows/functions + grouped_by_type['workflows'].append(node_tree) + else: + grouped_by_type['misc'].append(node_tree) # Fallback category + # Remove empty categories + return {k: v for k, v in grouped_by_type.items() if v} + + +def get_expected_profile_dump_tree( + groups_data: Optional[Dict[str, List[Dict]]] = None, + ungrouped_data: Optional[List[Dict]] = None, + organize_by_groups: bool = True, +) -> Dict[str, List[Any]]: + """ + Generates the expected profile dump tree structure dynamically. + + Args: + groups_data: Dict mapping group_label to list of node tree dicts for that group. + Example: {'add-group': [calc_tree1], 'multiply-add-group': [wc_tree1]} + ungrouped_data: List of node tree dicts for ungrouped nodes. + organize_by_groups: If True, nests nodes under group dirs/type subdirs. + If False, places all nodes directly under top-level type dirs. + + Returns: + A dictionary representing the expected file/directory tree structure. + """ + top_level_content = [ + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + '.aiida_dump_safeguard', + ] + + if organize_by_groups: + group_entries = [] + if groups_data: + for label, node_trees in groups_data.items(): + grouped_nodes_by_type = _assemble_nodes_by_type(node_trees) + group_content = ['.aiida_dump_safeguard'] # Safeguard inside each group dir + + # Iterate through the assembled types and add a dictionary for each + for type_label, trees in grouped_nodes_by_type.items(): + # No need to check for emptiness again, _assemble_nodes_by_type did it + group_content.append({type_label: trees}) # Append {'calculations': [...]} etc. 
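+
+                # For orientation (hypothetical PK 42): for a group holding a single add calculation,
+                # ``group_content`` now looks like
+                # ['.aiida_dump_safeguard', {'calculations': [{'ArithmeticAddCalculation-42': [...]}]}]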
+ + group_entries.append({label: group_content}) + + if group_entries: + top_level_content.append({'groups': group_entries}) + + if ungrouped_data: + ungrouped_nodes_by_type = _assemble_nodes_by_type(ungrouped_data) + # Check if there's actually anything to add for the 'ungrouped' directory + if ungrouped_nodes_by_type: + ungrouped_entry = ['.aiida_dump_safeguard'] # Safeguard for ungrouped dir + for type_label, trees in ungrouped_nodes_by_type.items(): + # No need to check for emptiness again + ungrouped_entry.append({type_label: trees}) + top_level_content.append({'ungrouped': ungrouped_entry}) + + else: # Not organized by groups (flat structure at top level by type) + all_node_trees = [] + if groups_data: + for node_list in groups_data.values(): + all_node_trees.extend(node_list) + # If also_ungrouped=True was used to generate the list, they are already included. + # If only specific groups were selected but ungrouped=True, they might be passed here. + if ungrouped_data: + all_node_trees.extend(ungrouped_data) + + nodes_by_type = _assemble_nodes_by_type(all_node_trees) + + # Add top-level directories for each node type + for type_label, trees in nodes_by_type.items(): + # No need to check for emptiness again + top_level_content.append({type_label: trees}) # Append {'calculations': [...]} etc. + + return {profile_dump_label: top_level_content} + + +def get_expected_group_dump_tree(dump_label: str, node_trees: List[Dict]) -> Dict[str, List[Any]]: + """Generates the expected tree for a GroupDumper output.""" + content = [ + 'aiida_dump_log.json', + '.aiida_dump_safeguard', + 'aiida_dump_config.yaml', + ] + nodes_by_type = _assemble_nodes_by_type(node_trees) + + # Iterate through the sorted types and append a dictionary for each subdirectory + for type_label, trees in nodes_by_type.items(): + # The value 'trees' is already the list of node dictionaries [{node1: content}, {node2: content}] + content.append({type_label: trees}) # Append {'calculations': [...]} or {'workflows': [...]} + + return {dump_label: content} + + +class TestProcessDumper: + """Tests the ProcessDumper facade.""" + + # test_init_and_verify: Remains largely the same, uses node.pk/uuid/process_label + # test_from_config: Remains the same, tests config loading + # test_dump_unsealed_raises: Remains the same + # test_dump_unsealed_allowed: Remains the same + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_facade_wc_io(self, generate_calculation_node_io, generate_workchain_node_io, tmp_path): + """Test dumping WorkChain with IO using ProcessDumper (nested).""" + # Setup + cj_nodes = [ + generate_calculation_node_io(attach_outputs=False), + generate_calculation_node_io(attach_outputs=False), + ] + wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) # Fixture seals and stores + wc_pk = wc_node.pk + wc_process_label = wc_node.process_label or 'WorkflowNode' # Get actual label or default + + # --- Get PKs of the nested structure --- + called_workflows = wc_node.called # Should contain wc_node_sub + assert len(called_workflows) == 1, 'Expected one called sub-workflow' + wc_node_sub = called_workflows[0] + wc_sub_pk = wc_node_sub.pk + wc_sub_process_label = wc_node_sub.process_label or 'WorkflowNode' + + called_calcs = wc_node_sub.called # Calcs called by the sub-workflow + assert len(called_calcs) == 2, 'Expected two called calculations from sub-workflow' + # Sort PKs for consistent order + child_calc_pks = tuple(sorted([n.pk for n in called_calcs])) + # Get label from one of the children (assuming they are 
the same type) + child_process_label = called_calcs[0].process_label or 'CalculationNodeWithIO' + # --- End PK gathering --- + + # --- Generate the expected tree using the CORRECT helper --- + expected_wc_content_tree = get_expected_nested_io_wc_tree( + wc_pk=wc_pk, + wc_sub_pk=wc_sub_pk, + child_calc_pks=child_calc_pks, + wc_process_label=wc_process_label, + wc_sub_process_label=wc_sub_process_label, + child_process_label=child_process_label, + ) + # --- End dynamic generation --- + + dump_label = f'{wc_process_label}-{wc_pk}' + dump_target_path = tmp_path / dump_label + config = DumpConfig(dump_mode=DumpMode.OVERWRITE) + process_dumper = ProcessDumper(process=wc_node, config=config, output_path=dump_target_path) + + # Dump + process_dumper.dump() + + # Create the final expected structure including standard files + expected_tree_content = expected_wc_content_tree[dump_label] + expected_tree_final = { + dump_label: [ + 'README.md', + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + ] + + expected_tree_content # Add standard files to node content list + } + + compare_tree(expected=expected_tree_final, base_path=tmp_path) + assert (dump_target_path / 'README.md').is_file() + assert (dump_target_path / 'aiida_dump_config.yaml').is_file() + assert (dump_target_path / 'aiida_dump_log.json').is_file() + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_facade_multiply_add(self, tmp_path, generate_workchain_multiply_add): + """Test dumping MultiplyAddWorkChain using ProcessDumper (nested and flat).""" + wc_node = generate_workchain_multiply_add() + wc_pk = wc_node.pk + child_pks = tuple(sorted([n.pk for n in wc_node.called_descendants])) + assert len(child_pks) == 2 + dump_label = f'{wc_node.process_label}-{wc_pk}' + + # --- Nested Dump --- + dump_target_path_nested = tmp_path / dump_label + config_nested = DumpConfig(dump_mode=DumpMode.OVERWRITE, include_outputs=True) # Include outputs + process_dumper_nested = ProcessDumper( + process=wc_node, config=config_nested, output_path=dump_target_path_nested + ) + process_dumper_nested.dump() + + # Generate expected nested tree + expected_wc_content_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + expected_tree_content_nested = expected_wc_content_tree[dump_label] + expected_tree_nested = { + dump_label: [ + 'README.md', + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + ] + + expected_tree_content_nested + } + compare_tree(expected=expected_tree_nested, base_path=tmp_path) + + # --- Flat Dump --- + # Requires a specific helper or adaptation based on actual flat structure + # Using the previously defined flat helper might be inaccurate. + # It's often better to test flat dumps via content verification or import. 
+ # If sticking to compare_tree: + # dump_target_path_flat = tmp_path / f'{dump_label}-flat' # Use different dir + # config_flat = DumpConfig(flat=True, dump_mode=DumpMode.OVERWRITE, include_outputs=True) + # process_dumper_flat = ProcessDumper(process=wc_node, config=config_flat, output_path=dump_target_path_flat) + # process_dumper_flat.dump() + # expected_tree_flat = get_expected_multiply_add_wc_tree_flat(wc_pk=wc_pk, child_pks=child_pks) # Needs defining + # compare_tree(expected=expected_tree_flat, base_path=tmp_path) + pass # Skipping flat compare_tree for WC as helper is complex/unverified + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_facade_calculation_io(self, tmp_path, generate_calculation_node_io): + """Test dumping a CalculationNode with complex IO using ProcessDumper.""" + calculation_node = generate_calculation_node_io(attach_outputs=True) + calculation_node.seal() + calc_pk = calculation_node.pk + # Try to get a more specific label if possible from fixture, else use generic + process_label = getattr(calculation_node, 'process_label', 'CalculationNodeWithIO') + dump_label = f'{process_label}-{calc_pk}' + dump_target_path = tmp_path / dump_label + + config = DumpConfig(include_outputs=True, dump_mode=DumpMode.OVERWRITE) + process_dumper = ProcessDumper(process=calculation_node, config=config, output_path=dump_target_path) + process_dumper.dump() + + # Generate expected tree + expected_node_tree = get_expected_io_calc_tree(pk=calc_pk, process_label=process_label) + expected_tree_content = expected_node_tree[dump_label] + expected_tree = { + dump_label: [ + 'README.md', + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + ] + + expected_tree_content + } + compare_tree(expected=expected_tree, base_path=tmp_path) + + # Content checks remain valuable + file_path = dump_target_path / 'inputs' / 'file.txt' + assert file_path.read_text() == 'a' + node_input_path = dump_target_path / 'node_inputs' / 'singlefile' / 'file.txt' + assert node_input_path.read_text() == 'a' + node_output_path = dump_target_path / 'node_outputs' / 'singlefile' / 'file.txt' + assert node_output_path.read_text() == 'a' # Assuming output is same as input for this test node + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_facade_calculation_flat(self, tmp_path, generate_calculation_node_io): + """Test flat dumping of a CalculationNode using ProcessDumper.""" + # As noted before, the exact flat structure representation and verification + # using compare_tree is tricky and potentially fragile. + # Content verification (checking file existence/content at the top level) + # or dump/import testing might be more suitable for flat dumps. + # Skipping the compare_tree part for flat dump here. 
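+        # For orientation (based on _IO_CALC_NODE_CONTENT_FLAT above): a flat dump is expected to
+        # place repository and node files such as 'file.txt' and 'default.npy' directly at the top
+        # level, next to '.aiida_node_metadata.yaml' and the dump log/config files, which is what
+        # the basic assertions below check.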
+ calculation_node = generate_calculation_node_io(attach_outputs=True) + calculation_node.seal() + calc_pk = calculation_node.pk + process_label = getattr(calculation_node, 'process_label', 'CalculationNodeWithIO') + dump_label = f'{process_label}-{calc_pk}-flat' # Use different name + dump_target_path = tmp_path / dump_label + + config = DumpConfig(flat=True, include_outputs=True, dump_mode=DumpMode.OVERWRITE) + process_dumper = ProcessDumper(process=calculation_node, config=config, output_path=dump_target_path) + process_dumper.dump() + + # Perform basic checks instead of full compare_tree for flat dump + assert (dump_target_path / '.aiida_node_metadata.yaml').is_file() + assert (dump_target_path / 'aiida_dump_log.json').is_file() + assert (dump_target_path / 'file.txt').is_file() # Check a key file is flattened + assert (dump_target_path / 'default.npy').is_file() + # Add more specific checks if needed based on expected flat output + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_facade_calculation_add(self, tmp_path, generate_calculation_node_add): + """Test dumping ArithmeticAddCalculation using ProcessDumper.""" + calculation_node = generate_calculation_node_add() # Fixture runs and seals + calc_pk = calculation_node.pk + process_label = calculation_node.process_label + dump_label = f'{process_label}-{calc_pk}' + dump_target_path = tmp_path / dump_label + + config = DumpConfig(include_outputs=True, dump_mode=DumpMode.OVERWRITE) + process_dumper = ProcessDumper(process=calculation_node, config=config, output_path=dump_target_path) + process_dumper.dump() + + # Generate expected tree using the node helper's content + expected_node_content = get_expected_add_calc_tree(pk=calc_pk)[dump_label] + expected_tree = { + dump_label: [ + 'README.md', + 'aiida_dump_log.json', + 'aiida_dump_config.yaml', + ] + + expected_node_content + } + compare_tree(expected=expected_tree, base_path=tmp_path) + + # test_dump_overwrite_incremental can likely remain as is, as it primarily + # tests file presence/absence and log content, not deep structure matching. + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_overwrite_incremental(self, tmp_path, generate_calculation_node_add): + """Tests overwrite and incremental dumping via the facade.""" + node = generate_calculation_node_add() + calc_pk = node.pk + process_label = node.process_label + dump_label = f'{process_label}-{calc_pk}' + dump_path = tmp_path / dump_label # Dump path is the node dir itself + + dump_path.mkdir() + (dump_path / 'dummy.txt').touch() # Make non-empty but without safeguard + + # 1. Test default (incremental) fails on non-empty dir without safeguard + config_incr_fail = DumpConfig(dump_mode=DumpMode.INCREMENTAL) + dumper_incr_fail = ProcessDumper(process=node, config=config_incr_fail, output_path=dump_path) + with pytest.raises(FileNotFoundError, match='but safeguard file'): + dumper_incr_fail.dump() + + # 2. Test overwrite fails on non-empty dir without safeguard + config_over_fail = DumpConfig(dump_mode=DumpMode.OVERWRITE) + dumper_over_fail = ProcessDumper(process=node, config=config_over_fail, output_path=dump_path) + with pytest.raises(FileNotFoundError, match='but safeguard file'): + dumper_over_fail.dump() + + # 3. 
Test overwrite works with safeguard + (dump_path / DumpPaths.safeguard_file).touch() # Add safeguard + config_over_ok = DumpConfig(dump_mode=DumpMode.OVERWRITE, include_outputs=True) + dumper_over_ok = ProcessDumper(process=node, config=config_over_ok, output_path=dump_path) + dumper_over_ok.dump() + assert not (dump_path / 'dummy.txt').exists() # Should be removed + assert (dump_path / 'inputs' / '_aiidasubmit.sh').is_file() # Dump content exists + assert (dump_path / DumpPaths.log_file).is_file() # Log file created + + # 4. Test incremental works with safeguard (on existing dump) + (dump_path / 'extra_file.txt').touch() # Add another file + generate_calculation_node_add() # Create another node + + config_incr_ok = DumpConfig(dump_mode=DumpMode.INCREMENTAL, include_outputs=True) + # Dump the *same* first node again in incremental mode + dumper_incr_same = ProcessDumper(process=node, config=config_incr_ok, output_path=dump_path) + dumper_incr_same.dump() + + assert (dump_path / 'extra_file.txt').exists() # Incremental doesn't delete + assert (dump_path / 'inputs' / '_aiidasubmit.sh').is_file() # Original content still there + + +class TestGroupDumper: + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group(self, tmp_path, setup_add_group): + add_group = setup_add_group + assert len(add_group.nodes) == 1 + node_pk = add_group.nodes[0].pk + + # Generate node tree + calc_tree = get_expected_add_calc_tree(pk=node_pk) + # Assemble group tree + expected_tree = get_expected_group_dump_tree(dump_label=add_group_dump_label, node_trees=[calc_tree]) + + output_path = tmp_path / add_group_dump_label + config = DumpConfig(filter_by_last_dump_time=False) # Ensure all nodes are dumped initially + group_dumper = GroupDumper(output_path=output_path, group=add_group, config=config) + group_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_multiply_add_group(self, tmp_path, setup_multiply_add_group): + multiply_add_group = setup_multiply_add_group + assert len(multiply_add_group.nodes) == 1 + wc_node = multiply_add_group.nodes[0] + wc_pk = wc_node.pk + child_pks = tuple(sorted([n.pk for n in wc_node.called_descendants])) + assert len(child_pks) == 2 + + # Generate node tree + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + # Assemble group tree + expected_tree = get_expected_group_dump_tree(dump_label=multiply_add_group_dump_label, node_trees=[wc_tree]) + + output_path = tmp_path / multiply_add_group_dump_label + # Rely on default config for incremental filter_by_last_dump_time=True + group_dumper = GroupDumper(output_path=output_path, group=multiply_add_group) + group_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_node_to_group(self, tmp_path, setup_add_group, generate_calculation_node_add): + add_group = setup_add_group + node1 = add_group.nodes[0] + node2 = generate_calculation_node_add() # Created but not in group yet + + output_path = tmp_path / add_group_dump_label + config = DumpConfig(filter_by_last_dump_time=False, dump_mode=DumpMode.INCREMENTAL) + group_dumper = GroupDumper(output_path=output_path, group=add_group, config=config) + + # Dump 1: Only node1 + group_dumper.dump() + tree1 = get_expected_group_dump_tree( + dump_label=add_group_dump_label, node_trees=[get_expected_add_calc_tree(node1.pk)] + ) + compare_tree(expected=tree1, base_path=tmp_path) + + # 
Add node2 to the group + add_group.add_nodes([node2]) + + # Dump 2: Both nodes + group_dumper.dump() # Re-run dump, should pick up group change + tree2 = get_expected_group_dump_tree( + dump_label=add_group_dump_label, + node_trees=[get_expected_add_calc_tree(node1.pk), get_expected_add_calc_tree(node2.pk)], + ) + compare_tree(expected=tree2, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group_copy(self, tmp_path, setup_add_group): + add_group = setup_add_group + node1 = add_group.nodes[0] + copy_label = 'add-group-copy' + copy_dump_label = f'{copy_label}-dump' + dest_group, _ = orm.Group.collection.get_or_create(label=copy_label) + dest_group.add_nodes(list(add_group.nodes)) + + output_path = tmp_path / copy_dump_label + config = DumpConfig(filter_by_last_dump_time=False) + group_dumper = GroupDumper(output_path=output_path, group=dest_group, config=config) + group_dumper.dump() + + # Generate expected tree for the copied group dump + calc_tree = get_expected_add_calc_tree(pk=node1.pk) + expected_tree = get_expected_group_dump_tree(dump_label=copy_dump_label, node_trees=[calc_tree]) + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_sub_calc_group(self, tmp_path, generate_workchain_multiply_add): + """Test dumping a group containing only sub-calculations of a workflow.""" + wf_node = generate_workchain_multiply_add() + sub_calcs = list(wf_node.called_descendants) + assert len(sub_calcs) == 2 + multiply_child = next(n for n in sub_calcs if 'multiply' in n.process_label) + add_child = next(n for n in sub_calcs if 'ArithmeticAdd' in n.process_label) + + group_label = 'sub-calc-group' + dump_label = f'{group_label}-dump' + group, _ = orm.Group.collection.get_or_create(label=group_label) + group.add_nodes(sub_calcs) + + output_path = tmp_path / dump_label + config = DumpConfig(filter_by_last_dump_time=False) + group_dumper = GroupDumper(output_path=output_path, group=group, config=config) + group_dumper.dump() + + # Generate expected tree for the group containing sub-calcs + multiply_tree = get_expected_multiply_func_tree(pk=multiply_child.pk) + add_tree = get_expected_add_calc_tree(pk=add_child.pk) + expected_tree = get_expected_group_dump_tree(dump_label=dump_label, node_trees=[multiply_tree, add_tree]) + compare_tree(expected=expected_tree, base_path=tmp_path) + + +class TestProfileDumper: + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group(self, tmp_path, setup_add_group): + add_group = setup_add_group + assert len(add_group.nodes) == 1 + add_node_pk = add_group.nodes[0].pk + + # Generate node tree + calc_tree = get_expected_add_calc_tree(pk=add_node_pk) + # Assemble profile tree + expected_tree = get_expected_profile_dump_tree(groups_data={add_group.label: [calc_tree]}) + + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL) + profile_dumper = ProfileDumper(config=config, output_path=tmp_path / profile_dump_label) + profile_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_multiply_add_group(self, tmp_path, setup_multiply_add_group): + multiply_add_group = setup_multiply_add_group + assert len(multiply_add_group.nodes) == 1 + wc_node = multiply_add_group.nodes[0] + wc_pk = wc_node.pk + child_pks = tuple(sorted([_.pk for _ in wc_node.called_descendants])) # Ensure consistent order + assert len(child_pks) == 2, 'Expected 2 children for WC' 
+ + # Generate node tree + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + # Assemble profile tree + expected_tree = get_expected_profile_dump_tree(groups_data={multiply_add_group.label: [wc_tree]}) + + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL) + profile_dumper = ProfileDumper(config=config, output_path=tmp_path / profile_dump_label) + profile_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_multiply_add_groups(self, tmp_path, setup_add_group, setup_multiply_add_group): + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + + # Get PKs + assert len(add_group.nodes) == 1 + add_node_pk = add_group.nodes[0].pk + assert len(multiply_add_group.nodes) == 1 + wc_node = multiply_add_group.nodes[0] + wc_pk = wc_node.pk + child_pks = tuple(sorted([_.pk for _ in wc_node.called_descendants])) + assert len(child_pks) == 2 + + # Generate individual node trees + calc_tree = get_expected_add_calc_tree(pk=add_node_pk) + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + + # Assemble profile tree + expected_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree], multiply_add_group.label: [wc_tree]} + ) + + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL) + profile_dumper = ProfileDumper(config=config, output_path=tmp_path / profile_dump_label) + profile_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_multiply_add_add_groups(self, tmp_path, setup_add_group, setup_multiply_add_group): + # This test setup is identical to the previous one, just run in a different order + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + assert len(add_group.nodes) == 1 + add_node_pk = add_group.nodes[0].pk + assert len(multiply_add_group.nodes) == 1 + wc_node = multiply_add_group.nodes[0] + wc_pk = wc_node.pk + child_pks = tuple(sorted([_.pk for _ in wc_node.called_descendants])) + assert len(child_pks) == 2 + + calc_tree = get_expected_add_calc_tree(pk=add_node_pk) + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + + # Assemble profile tree (order of groups in dict doesn't matter for structure) + expected_tree = get_expected_profile_dump_tree( + groups_data={ + multiply_add_group.label: [wc_tree], # Different order here + add_group.label: [calc_tree], + } + ) + + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL) + profile_dumper = ProfileDumper(config=config, output_path=tmp_path / profile_dump_label) + profile_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_no_organize_by_groups(self, tmp_path, setup_add_group, setup_multiply_add_group): + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + assert len(add_group.nodes) == 1 + add_node_pk = add_group.nodes[0].pk + assert len(multiply_add_group.nodes) == 1 + wc_node = multiply_add_group.nodes[0] + wc_pk = wc_node.pk + child_pks = tuple(sorted([_.pk for _ in wc_node.called_descendants])) + assert len(child_pks) == 2 + + # Generate node trees + calc_tree = get_expected_add_calc_tree(pk=add_node_pk) + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=wc_pk, child_pks=child_pks) + + # Assemble profile tree NOT organized by groups + 
expected_tree = get_expected_profile_dump_tree( + groups_data={ # Need to pass groups so nodes are selected + add_group.label: [calc_tree], + multiply_add_group.label: [wc_tree], + }, + organize_by_groups=False, + ) + + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, organize_by_groups=False) + profile_dumper = ProfileDumper(output_path=tmp_path / profile_dump_label, config=config) + profile_dumper.dump() + + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_also_ungrouped( + self, + tmp_path, + setup_add_group, + setup_multiply_add_group, + generate_calculation_node_add, + generate_workchain_multiply_add, + ): + """Tests dumping grouped and optionally ungrouped nodes.""" + # Setup grouped nodes + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + grouped_add_node = add_group.nodes[0] + grouped_wc_node = multiply_add_group.nodes[0] + grouped_wc_child_pks = tuple(sorted([n.pk for n in grouped_wc_node.called_descendants])) + + # Create ungrouped nodes + ungrouped_add_node = generate_calculation_node_add() + ungrouped_wc_node = generate_workchain_multiply_add() + ungrouped_wc_child_pks = tuple(sorted([n.pk for n in ungrouped_wc_node.called_descendants])) + + output_path = tmp_path / profile_dump_label + + # --- Dump 1: Only grouped nodes --- + config_grouped = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, also_ungrouped=False) + profile_dumper_grouped = ProfileDumper(output_path=output_path, config=config_grouped) + profile_dumper_grouped.dump() + + # Generate expected tree for grouped nodes only + grouped_calc_tree = get_expected_add_calc_tree(pk=grouped_add_node.pk) + grouped_wc_tree = get_expected_multiply_add_wc_tree(wc_pk=grouped_wc_node.pk, child_pks=grouped_wc_child_pks) + expected_tree_grouped = get_expected_profile_dump_tree( + groups_data={ + add_group.label: [grouped_calc_tree], + multiply_add_group.label: [grouped_wc_tree], + }, + organize_by_groups=True, # Assuming default organization + ) + compare_tree(expected=expected_tree_grouped, base_path=tmp_path) + + # --- Dump 2: Include ungrouped nodes (incremental) --- + # Note: We re-instantiate dumper to ensure fresh state reading if needed, + # or rely on the dumper correctly handling incremental logic with config change. + # Re-instantiating is often safer in tests. 
+ config_all = DumpConfig( + profile_dump_selection=ProfileDumpSelection.ALL, + also_ungrouped=True, + filter_by_last_dump_time=False, # Ensure all are dumped regardless of modification + dump_mode=DumpMode.INCREMENTAL, # Add to existing dump + ) + profile_dumper_all = ProfileDumper(output_path=output_path, config=config_all) + profile_dumper_all.dump() + + # Generate expected tree for all nodes + ungrouped_calc_tree = get_expected_add_calc_tree(pk=ungrouped_add_node.pk) + ungrouped_wc_tree = get_expected_multiply_add_wc_tree( + wc_pk=ungrouped_wc_node.pk, child_pks=ungrouped_wc_child_pks + ) + expected_tree_all = get_expected_profile_dump_tree( + groups_data={ + add_group.label: [grouped_calc_tree], + multiply_add_group.label: [grouped_wc_tree], + }, + ungrouped_data=[ungrouped_calc_tree, ungrouped_wc_tree], + organize_by_groups=True, + ) + compare_tree(expected=expected_tree_all, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_node_to_group(self, tmp_path, setup_add_group, generate_calculation_node_add): + add_group = setup_add_group + node1 = add_group.nodes[0] + node2 = generate_calculation_node_add() # Created but not in group yet + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + + # Dump 1: Only node1 should be in the group dump + profile_dumper.dump() + tree1 = get_expected_profile_dump_tree(groups_data={add_group.label: [get_expected_add_calc_tree(node1.pk)]}) + compare_tree(expected=tree1, base_path=tmp_path) + + # Add node2 to the group + add_group.add_nodes([node2]) + + # Dump 2: Both nodes should be in the group dump + profile_dumper.dump() # Re-run dump, should pick up group change + tree2 = get_expected_profile_dump_tree( + groups_data={add_group.label: [get_expected_add_calc_tree(node1.pk), get_expected_add_calc_tree(node2.pk)]} + ) + compare_tree(expected=tree2, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group_copy(self, tmp_path, setup_add_group): + add_group = setup_add_group + node1 = add_group.nodes[0] + copy_group_label = 'add-group-copy' + dest_group, _ = orm.Group.collection.get_or_create(label=copy_group_label) + dest_group.add_nodes(list(add_group.nodes)) + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + profile_dumper.dump() + + # Generate expected tree with node1 in both groups + calc_tree = get_expected_add_calc_tree(pk=node1.pk) + expected_tree = get_expected_profile_dump_tree( + groups_data={ + add_group.label: [calc_tree], + copy_group_label: [calc_tree], # Node appears in both + } + ) + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group_copy_symlink(self, tmp_path, setup_add_group): + add_group = setup_add_group + node1 = add_group.nodes[0] + copy_group_label = 'add-group-copy' + dest_group, _ = orm.Group.collection.get_or_create(label=copy_group_label) + dest_group.add_nodes(list(add_group.nodes)) + + output_path = tmp_path / profile_dump_label + config = DumpConfig( + profile_dump_selection=ProfileDumpSelection.ALL, symlink_calcs=True, filter_by_last_dump_time=False + ) + profile_dumper = ProfileDumper(output_path=output_path, 
config=config) + profile_dumper.dump() + + # --- Symlink specific checks --- + node_dir_name = f'{node1.process_label}-{node1.pk}' + path_in_group1 = output_path / 'groups' / add_group.label / 'calculations' / node_dir_name + path_in_group2 = output_path / 'groups' / copy_group_label / 'calculations' / node_dir_name + + assert path_in_group1.is_dir() and not path_in_group1.is_symlink(), 'Source path should be a directory' + assert path_in_group2.is_symlink(), 'Second path should be a symlink' + assert path_in_group2.resolve() == path_in_group1.resolve(), 'Symlink target mismatch' + # --- End symlink checks --- + + # Check overall structure (compare_tree implicitly follows links) + calc_tree = get_expected_add_calc_tree(pk=node1.pk) + expected_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree], copy_group_label: [calc_tree]} + ) + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_sub_calc_group(self, tmp_path, generate_workchain_multiply_add): + """Test dumping a group containing only sub-calculations of a workflow.""" + wf_node = generate_workchain_multiply_add() + sub_calcs = list(wf_node.called_descendants) + assert len(sub_calcs) == 2 + multiply_child = next(n for n in sub_calcs if 'multiply' in n.process_label) + add_child = next(n for n in sub_calcs if 'ArithmeticAdd' in n.process_label) + + group_label = 'sub-calc-group' + group, _ = orm.Group.collection.get_or_create(label=group_label) + group.add_nodes(sub_calcs) + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + profile_dumper.dump() + + # Generate expected tree (only sub-calcs in the group) + multiply_tree = get_expected_multiply_func_tree(pk=multiply_child.pk) + add_tree = get_expected_add_calc_tree(pk=add_child.pk) + expected_tree = get_expected_profile_dump_tree(groups_data={group_label: [multiply_tree, add_tree]}) + compare_tree(expected=expected_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_delete_nodes(self, tmp_path, setup_add_group, setup_multiply_add_group): + from aiida.tools.graph.deletions import delete_nodes + + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + node_add = add_group.nodes[0] + node_wc = multiply_add_group.nodes[0] + wc_child_pks = tuple(sorted([n.pk for n in node_wc.called_descendants])) + + # === Store info needed AFTER deletion BEFORE deleting === + deleted_node_pk = node_add.pk + deleted_node_process_label = node_add.process_label + # ====================================================== + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + + # Dump 1: Full initial state + profile_dumper.dump() + calc_tree = get_expected_add_calc_tree(pk=node_add.pk) # Getting pk here is fine + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=node_wc.pk, child_pks=wc_child_pks) + initial_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree], multiply_add_group.label: [wc_tree]} + ) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Delete the add node + delete_nodes(pks=[node_add.pk], dry_run=False) # Using pk is fine + + # Dump 2: Incremental dump 
with delete_missing=True + config_del = DumpConfig( + delete_missing=True, profile_dump_selection=ProfileDumpSelection.ALL, dump_mode=DumpMode.INCREMENTAL + ) + profile_dumper_del = ProfileDumper(output_path=output_path, config=config_del) + profile_dumper_del.dump() + + # Generate expected tree after deletion + final_tree = get_expected_profile_dump_tree( + groups_data={ + add_group.label: [], # Empty node list + multiply_add_group.label: [wc_tree], # wc_tree definition is still valid + } + ) + compare_tree(expected=final_tree, base_path=tmp_path) + + # Explicitly check the deleted node's directory is gone + # === Use the stored values === + deleted_node_dir = ( + output_path + / 'groups' + / add_group.label + / 'calculations' + / f'{deleted_node_process_label}-{deleted_node_pk}' + ) + # ============================ + assert not deleted_node_dir.exists(), 'Deleted node directory still exists' + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_delete_group(self, tmp_path, setup_add_group, setup_multiply_add_group): + add_group = setup_add_group + multiply_add_group = setup_multiply_add_group + node_add = add_group.nodes[0] + node_wc = multiply_add_group.nodes[0] + wc_child_pks = tuple(sorted([n.pk for n in node_wc.called_descendants])) + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + + # Dump 1: Full initial state + profile_dumper.dump() + calc_tree_initial = get_expected_add_calc_tree(pk=node_add.pk) + wc_tree_initial = get_expected_multiply_add_wc_tree(wc_pk=node_wc.pk, child_pks=wc_child_pks) + initial_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree_initial], multiply_add_group.label: [wc_tree_initial]} + ) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Delete the multiply_add_group (but not its nodes) + orm.Group.collection.delete(multiply_add_group.pk) + + # Dump 2: Incremental dump, should remove multiply_add_group dir + config_del = DumpConfig( + delete_missing=True, profile_dump_selection=ProfileDumpSelection.ALL, dump_mode=DumpMode.INCREMENTAL + ) + profile_dumper_del = ProfileDumper(output_path=output_path, config=config_del) + profile_dumper_del.dump() + + # Generate expected tree after group deletion + tree_after_del = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree_initial]} # Only add_group remains + ) + compare_tree(expected=tree_after_del, base_path=tmp_path) + # Check multiply_add_group dir is gone + assert not (output_path / 'groups' / multiply_add_group_label).exists() + + # Dump 3: Include ungrouped, should find the WC node now + config_ungrouped = DumpConfig( + also_ungrouped=True, + delete_missing=True, + profile_dump_selection=ProfileDumpSelection.ALL, + dump_mode=DumpMode.INCREMENTAL, + filter_by_last_dump_time=False, + ) + profile_dumper_ungrouped = ProfileDumper(output_path=output_path, config=config_ungrouped) + profile_dumper_ungrouped.dump() + + # Generate expected tree with WC node in ungrouped + tree_final = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree_initial]}, + ungrouped_data=[wc_tree_initial], # WC tree now appears here + ) + compare_tree(expected=tree_final, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_node_group_membership_change(self, tmp_path, setup_add_group, setup_multiply_add_group): + add_group = 
setup_add_group + multiply_add_group = setup_multiply_add_group + node_add = add_group.nodes[0] + node_wc = multiply_add_group.nodes[0] + wc_child_pks = tuple(sorted([n.pk for n in node_wc.called_descendants])) + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + # Use a single dumper instance, relying on it to detect changes + profile_dumper = ProfileDumper(config=config, output_path=output_path) + + # Dump 1: Initial state + profile_dumper.dump() + calc_tree = get_expected_add_calc_tree(pk=node_add.pk) + wc_tree = get_expected_multiply_add_wc_tree(wc_pk=node_wc.pk, child_pks=wc_child_pks) + initial_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [calc_tree], multiply_add_group.label: [wc_tree]} + ) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Change membership + add_group.remove_nodes([node_add]) + multiply_add_group.add_nodes([node_add]) + + # Dump 2: Should reflect the change + profile_dumper.dump() + final_tree = get_expected_profile_dump_tree( + groups_data={ + add_group.label: [], # Now empty + multiply_add_group.label: [wc_tree, calc_tree], # Contains both now + } + ) + compare_tree(expected=final_tree, base_path=tmp_path) + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_add_group_relabel(self, tmp_path, setup_add_group): + add_group = setup_add_group + node_add = add_group.nodes[0] + old_label = add_group.label + new_label = 'add-group-relabelled' + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL, filter_by_last_dump_time=False) + profile_dumper = ProfileDumper(config=config, output_path=output_path) + + # Dump 1: Initial state + profile_dumper.dump() + calc_tree = get_expected_add_calc_tree(pk=node_add.pk) + initial_tree = get_expected_profile_dump_tree(groups_data={old_label: [calc_tree]}) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Relabel the group + add_group.label = new_label + + # Dump 2: Update groups, should reflect relabeling + # Re-instantiate dumper or ensure config update is picked up + config_update = DumpConfig( + profile_dump_selection=ProfileDumpSelection.ALL, + update_groups=True, + dump_mode=DumpMode.INCREMENTAL, + filter_by_last_dump_time=False, + ) + profile_dumper_update = ProfileDumper(config=config_update, output_path=output_path) + profile_dumper_update.dump() + + # Generate expected tree with new label + final_tree = get_expected_profile_dump_tree(groups_data={new_label: [calc_tree]}) + compare_tree(expected=final_tree, base_path=tmp_path) + + # Verify old group directory is gone (assuming dumper removes it on relabel+update) + assert not (output_path / 'groups' / old_label).exists() + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_no_changes_early_return(self, tmp_path, setup_add_group, caplog): + """Tests that the dumper returns early if no changes are detected.""" + add_group = setup_add_group + node_add = add_group.nodes[0] + + output_path = tmp_path / profile_dump_label + config = DumpConfig(profile_dump_selection=ProfileDumpSelection.ALL) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + + # Dump 1: Initial dump + profile_dumper.dump() + initial_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [get_expected_add_calc_tree(node_add.pk)]} + ) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Dump 2: No changes, check log message 
+ caplog.clear() + with caplog.at_level(logging.REPORT, logger='aiida.tools.dumping.engine'): + profile_dumper.dump() # Should detect no changes via log + + assert ( + 'No changes detected since last dump' in caplog.text + ), "Engine did not log the expected 'No changes detected' message." + compare_tree(expected=initial_tree, base_path=tmp_path) # Structure remains identical + + @pytest.mark.usefixtures('aiida_profile_clean') + def test_dump_filter_by_last_dump_time(self, tmp_path, setup_add_group, generate_calculation_node_add): + """Tests that unmodified nodes are skipped in incremental dumps.""" + add_group = setup_add_group + original_node = add_group.nodes[0] + output_path = tmp_path / profile_dump_label + config = DumpConfig( + profile_dump_selection=ProfileDumpSelection.ALL, + filter_by_last_dump_time=True, # Explicitly enable (though default) + dump_mode=DumpMode.INCREMENTAL, # Essential for this test + ) + profile_dumper = ProfileDumper(output_path=output_path, config=config) + + # Dump 1: Initial dump + profile_dumper.dump() + original_calc_tree = get_expected_add_calc_tree(pk=original_node.pk) + initial_tree = get_expected_profile_dump_tree(groups_data={add_group.label: [original_calc_tree]}) + compare_tree(expected=initial_tree, base_path=tmp_path) + + # Record mtime + original_node_dir_name = f'{original_node.process_label}-{original_node.pk}' + original_node_dump_path = output_path / 'groups' / add_group.label / 'calculations' / original_node_dir_name + assert original_node_dump_path.exists() + mtime_orig_node_before = original_node_dump_path.stat().st_mtime + time.sleep(0.1) # Ensure timestamp changes + + # Add a new node + new_node = generate_calculation_node_add() + add_group.add_nodes([new_node]) + + # Dump 2: Incremental dump, filtering by time + profile_dumper.dump() # Config already set to incremental/filter_by_time + + # Check mtime of original node dir DID NOT change + mtime_orig_node_after = original_node_dump_path.stat().st_mtime + # Use approx comparison due to potential filesystem time resolution issues + assert ( + abs(mtime_orig_node_before - mtime_orig_node_after) < 0.1 + ), 'Original node dump directory was modified in time-filtered incremental update' + + # Check new node dir DOES exist + new_node_dir_name = f'{new_node.process_label}-{new_node.pk}' + new_node_dump_path = output_path / 'groups' / add_group.label / 'calculations' / new_node_dir_name + assert new_node_dump_path.is_dir(), 'New node was not dumped' + + # Verify final overall structure contains both + new_calc_tree = get_expected_add_calc_tree(pk=new_node.pk) + final_expected_tree = get_expected_profile_dump_tree( + groups_data={add_group.label: [original_calc_tree, new_calc_tree]} + ) + compare_tree(expected=final_expected_tree, base_path=tmp_path) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py deleted file mode 100644 index accfbd17d2..0000000000 --- a/tests/tools/dumping/test_processes.py +++ /dev/null @@ -1,517 +0,0 @@ -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. 
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Tests for the dumping of ProcessNode data to disk.""" - -from __future__ import annotations - -import io -import shutil -from pathlib import Path - -import pytest - -from aiida.common.links import LinkType -from aiida.tools.dumping.processes import ProcessDumper - -# Non-AiiDA variables -filename = 'file.txt' -filecontent = 'a' -inputs_relpath = Path('inputs') -outputs_relpath = Path('outputs') -node_inputs_relpath = Path('node_inputs') -node_outputs_relpath = Path('node_outputs') -default_dump_paths = [inputs_relpath, outputs_relpath, node_inputs_relpath, node_outputs_relpath] -custom_dump_paths = [f'{path}_' for path in default_dump_paths] - -# Define variables used for constructing the nodes used to test the dumping -singlefiledata_linklabel = 'singlefile' -folderdata_linklabel = 'folderdata' -folderdata_relpath = Path('relative_path') -folderdata_test_path = folderdata_linklabel / folderdata_relpath -arraydata_linklabel = 'arraydata' -node_metadata_file = '.aiida_node_metadata.yaml' - - -# Helper functions to generate the actual `WorkflowNode`s and `CalculationNode`s used for testing -@pytest.fixture -def generate_calculation_node_io(generate_calculation_node, tmp_path): - def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): - import numpy as np - - from aiida.orm import ArrayData, FolderData, SinglefileData - - singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) - # ? Use instance for folderdata - folderdata = FolderData() - folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=str(folderdata_relpath / filename)) # type: ignore[arg-type] - arraydata_input = ArrayData(arrays=np.ones(3)) - - # Create calculation inputs, outputs - calculation_node_inputs = { - singlefiledata_linklabel: singlefiledata_input, - folderdata_linklabel: folderdata, - arraydata_linklabel: arraydata_input, - } - - singlefiledata_output = singlefiledata_input.clone() - folderdata_output = folderdata.clone() - - if attach_outputs: - calculation_outputs = { - folderdata_linklabel: folderdata_output, - singlefiledata_linklabel: singlefiledata_output, - } - else: - calculation_outputs = None - - # Actually write repository file and then read it in when generating calculation_node - (tmp_path / filename).write_text(filecontent) - - calculation_node = generate_calculation_node( - repository=tmp_path, - inputs=calculation_node_inputs, - outputs=calculation_outputs, - entry_point=entry_point, - ) - return calculation_node - - return _generate_calculation_node_io - - -@pytest.fixture -def generate_workchain_node_io(): - def _generate_workchain_node_io(cj_nodes, store_all: bool = True): - """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `Calculation` with file io.""" - from aiida.orm import WorkflowNode - - wc_node = WorkflowNode() - wc_node_sub = WorkflowNode() - - # Add sub-workchain that calls a calculation - wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workflow') - for cj_node in cj_nodes: - cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') - - # Set process_state so that tests don't throw exception for 
build_call_graph of README generation - [cj_node.set_process_state('finished') for cj_node in cj_nodes] - wc_node.set_process_state('finished') - wc_node_sub.set_process_state('finished') - - # Need to store so that outputs are being dumped - if store_all: - wc_node.store() - wc_node_sub.store() - [cj_node.store() for cj_node in cj_nodes] - - return wc_node - - return _generate_workchain_node_io - - -# Only test top-level actions, like path and README creation -# Other things tested via `_dump_workflow` and `_dump_calculation` -def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - from aiida.tools.archive.exceptions import ExportValidationError - - dump_parent_path = tmp_path / 'wc-dump-test-io' - process_dumper = ProcessDumper() - # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain - cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] - wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) - - # Raises if ProcessNode not sealed - with pytest.raises(ExportValidationError): - return_path = process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - - wc_node.seal() - return_path = process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - - assert dump_parent_path.is_dir() - assert (dump_parent_path / 'README.md').is_file() - assert return_path == dump_parent_path - - -def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` - dump_parent_path = tmp_path / 'wc-workflow_dump-test-io' - process_dumper = ProcessDumper() - # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain - cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] - wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) - process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - - input_path = '01-sub_workflow/01-calculation/inputs/file.txt' - singlefiledata_path = '01-sub_workflow/01-calculation/node_inputs/singlefile/file.txt' - folderdata_path = '01-sub_workflow/01-calculation/node_inputs/folderdata/relative_path/file.txt' - arraydata_path = '01-sub_workflow/01-calculation/node_inputs/arraydata/default.npy' - node_metadata_paths = [ - node_metadata_file, - f'01-sub_workflow/{node_metadata_file}', - f'01-sub_workflow/01-calculation/{node_metadata_file}', - f'01-sub_workflow/02-calculation/{node_metadata_file}', - ] - - expected_files = [input_path, singlefiledata_path, folderdata_path, arraydata_path, *node_metadata_paths] - expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - - assert all([expected_file.is_file() for expected_file in expected_files]) - - # Flat dumping - dump_parent_path = tmp_path / 'wc-dump-test-io-flat' - process_dumper = ProcessDumper(flat=True) - process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - - input_path = '01-sub_workflow/01-calculation/file.txt' - arraydata_path = '01-sub_workflow/01-calculation/default.npy' - folderdata_path = '01-sub_workflow/01-calculation/relative_path/file.txt' - node_metadata_paths = [ - node_metadata_file, - f'01-sub_workflow/{node_metadata_file}', - f'01-sub_workflow/01-calculation/{node_metadata_file}', - 
f'01-sub_workflow/02-calculation/{node_metadata_file}', - ] - - expected_files = [input_path, folderdata_path, arraydata_path, *node_metadata_paths] - expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - - assert all([expected_file.is_file() for expected_file in expected_files]) - - -def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): - dump_parent_path = tmp_path / 'wc-dump-test-multiply-add' - process_dumper = ProcessDumper() - wc_node = generate_workchain_multiply_add() - process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - - input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - input_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / inputs_relpath / input_file for input_file in input_files - ] - input_files += [dump_parent_path / '01-multiply' / inputs_relpath / 'source_file'] - output_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / outputs_relpath / output_file for output_file in output_files - ] - - # No node_inputs contained in MultiplyAddWorkChain - assert all([input_file.is_file() for input_file in input_files]) - assert all([output_file.is_file() for output_file in output_files]) - - # Flat dumping - dump_parent_path = tmp_path / 'wc-dump-test-multiply-add-flat' - process_dumper = ProcessDumper(flat=True) - process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - - multiply_file = dump_parent_path / '01-multiply' / 'source_file' - arithmetic_add_files = [ - '_aiidasubmit.sh', - 'aiida.in', - '.aiida/job_tmpl.json', - '.aiida/calcinfo.json', - '_scheduler-stderr.txt', - '_scheduler-stdout.txt', - 'aiida.out', - ] - arithmetic_add_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / arithmetic_add_file - for arithmetic_add_file in arithmetic_add_files - ] - - assert multiply_file.is_file() - assert all([expected_file.is_file() for expected_file in arithmetic_add_files]) - - -# Tests for dump_calculation method -def test_dump_calculation_node(tmp_path, generate_calculation_node_io): - # Checking the actual content should be handled by `test_copy_tree` - - # Normal dumping -> node_inputs and not flat; no paths provided - dump_parent_path = tmp_path / 'cj-dump-test-io' - process_dumper = ProcessDumper(include_outputs=True) - calculation_node = generate_calculation_node_io() - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - - assert (dump_parent_path / inputs_relpath / filename).is_file() - assert (dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / node_inputs_relpath / folderdata_test_path / filename).is_file() - assert (dump_parent_path / node_inputs_relpath / arraydata_linklabel / 'default.npy').is_file() - - assert (dump_parent_path / node_outputs_relpath / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / node_outputs_relpath / folderdata_test_path / filename).is_file() - - # Check contents once - with open(dump_parent_path / inputs_relpath / filename, 'r') as handle: - assert handle.read() == filecontent - with open(dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename) as handle: - assert handle.read() == filecontent - with open(dump_parent_path / node_inputs_relpath / folderdata_test_path / filename) as handle: - assert handle.read() == filecontent - with 
open(dump_parent_path / node_outputs_relpath / singlefiledata_linklabel / filename) as handle: - assert handle.read() == filecontent - with open(dump_parent_path / node_outputs_relpath / folderdata_test_path / filename) as handle: - assert handle.read() == filecontent - - -def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): - # Flat dumping -> no paths provided -> Default paths should not be existent. - # Internal FolderData structure retained. - dump_parent_path = tmp_path / 'cj-dump-test-custom' - process_dumper = ProcessDumper(flat=True) - calculation_node = generate_calculation_node_io() - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - - # Here, the same file will be written by inputs and node_outputs and node_inputs - # So it should only be present once in the parent dump directory - assert not (dump_parent_path / inputs_relpath).is_dir() - assert not (dump_parent_path / node_inputs_relpath).is_dir() - assert not (dump_parent_path / outputs_relpath).is_dir() - assert (dump_parent_path / filename).is_file() - assert (dump_parent_path / 'default.npy').is_file() - assert (dump_parent_path / folderdata_relpath / filename).is_file() - - -# Here, in principle, test only non-default arguments, as defaults tested above -def test_dump_calculation_overwr_incr(tmp_path, generate_calculation_node_io): - """Tests the ProcessDumper for the overwrite and incremental option.""" - dump_parent_path = tmp_path / 'cj-dump-test-overwrite' - process_dumper = ProcessDumper(overwrite=False, incremental=False) - calculation_node = generate_calculation_node_io() - calculation_node.seal() - # Create safeguard file to mock existing dump directory - dump_parent_path.mkdir() - # we create safeguard file so the dumping works - (dump_parent_path / '.aiida_node_metadata.yaml').touch() - with pytest.raises(FileExistsError): - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - # With overwrite option true no error is raised and the dumping can run through. 
- process_dumper = ProcessDumper(overwrite=True, incremental=False) - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - assert (dump_parent_path / inputs_relpath / filename).is_file() - - shutil.rmtree(dump_parent_path) - - # Incremental also does work - dump_parent_path.mkdir() - (dump_parent_path / '.aiida_node_metadata.yaml').touch() - process_dumper = ProcessDumper(overwrite=False, incremental=True) - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - assert (dump_parent_path / inputs_relpath / filename).is_file() - - -# With both inputs and outputs being dumped is the standard test case above, so only test without inputs here -def test_dump_calculation_no_inputs(tmp_path, generate_calculation_node_io): - dump_parent_path = tmp_path / 'cj-dump-test-noinputs' - process_dumper = ProcessDumper(include_inputs=False) - calculation_node = generate_calculation_node_io() - process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - assert not (dump_parent_path / node_inputs_relpath).is_dir() - - -def test_dump_calculation_add(tmp_path, generate_calculation_node_add): - dump_parent_path = tmp_path / 'cj-dump-test-add' - - process_dumper = ProcessDumper() - calculation_node_add = generate_calculation_node_add() - process_dumper._dump_calculation(calculation_node=calculation_node_add, output_path=dump_parent_path) - - input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - input_files = [dump_parent_path / inputs_relpath / input_file for input_file in input_files] - output_files = [dump_parent_path / outputs_relpath / output_file for output_file in output_files] - - assert all([input_file.is_file() for input_file in input_files]) - assert all([output_file.is_file() for output_file in output_files]) - - -# Tests for helper methods -@pytest.mark.usefixtures('chdir_tmp_path') -def test_prepare_dump_path(tmp_path): - from aiida.tools.dumping.utils import prepare_dump_path - - test_dir = tmp_path / Path('test-dir') - test_file = test_dir / filename - safeguard_file = node_metadata_file - safeguard_file_path = test_dir / safeguard_file - - # Cannot set both overwrite and incremental to True - with pytest.raises(ValueError): - prepare_dump_path(path_to_validate=test_dir, overwrite=True, incremental=True) - - # Check that fails if file with same name as output dir - test_dir.touch() - with pytest.raises(FileExistsError): - prepare_dump_path(path_to_validate=test_dir) - test_dir.unlink() - - # Check if path created if non-existent - prepare_dump_path(path_to_validate=test_dir) - assert test_dir.exists() - assert safeguard_file_path.is_file() - - # Directory exists, but empty -> is fine - safeguard_file_path.unlink() - prepare_dump_path(path_to_validate=test_dir) - assert test_dir.exists() - assert safeguard_file_path.is_file() - - # Fails if directory not empty, safeguard file existent, and overwrite set to False - test_file.touch() - safeguard_file_path.touch() - with pytest.raises(FileExistsError): - prepare_dump_path(path_to_validate=test_dir, overwrite=False, incremental=False) - - # Fails if directory not empty, overwrite set to True, but safeguard_file not found (for safety reasons) - safeguard_file_path.unlink() - test_file.touch() - with pytest.raises(FileNotFoundError): - prepare_dump_path(path_to_validate=test_dir, overwrite=True, 
incremental=False) - - # Works if directory not empty, overwrite set to True and safeguard_file contained - # -> After function call, test_file is deleted, and safeguard_file again created - safeguard_file_path.touch() - prepare_dump_path( - path_to_validate=test_dir, - safeguard_file=safeguard_file, - overwrite=True, - incremental=False, - ) - assert not test_file.is_file() - assert safeguard_file_path.is_file() - - # Works if directory not empty, but incremental=True and safeguard_file (e.g. `.aiida_node_metadata.yaml`) contained - # -> After function call, test file and safeguard_file still there - test_file.touch() - prepare_dump_path(path_to_validate=test_dir, safeguard_file=safeguard_file, incremental=True) - assert safeguard_file_path.is_file() - assert test_file.is_file() - - -def test_generate_default_dump_path( - generate_calculation_node_add, - generate_workchain_multiply_add, -): - process_dumper = ProcessDumper() - add_node = generate_calculation_node_add() - multiply_add_node = generate_workchain_multiply_add() - add_path = process_dumper._generate_default_dump_path(process_node=add_node) - multiply_add_path = process_dumper._generate_default_dump_path(process_node=multiply_add_node) - - assert str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' - assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' - - -def test_generate_calculation_io_mapping(): - process_dumper = ProcessDumper() - calculation_io_mapping = process_dumper._generate_calculation_io_mapping() - assert calculation_io_mapping.repository == 'inputs' - assert calculation_io_mapping.retrieved == 'outputs' - assert calculation_io_mapping.inputs == 'node_inputs' - assert calculation_io_mapping.outputs == 'node_outputs' - - calculation_io_mapping = process_dumper._generate_calculation_io_mapping(io_dump_paths=custom_dump_paths) - assert calculation_io_mapping.repository == 'inputs_' - assert calculation_io_mapping.retrieved == 'outputs_' - assert calculation_io_mapping.inputs == 'node_inputs_' - assert calculation_io_mapping.outputs == 'node_outputs_' - - -def test_generate_child_node_label( - generate_workchain_multiply_add, generate_calculation_node_io, generate_workchain_node_io -): - # Check with manually constructed, more complex workchain - cj_node = generate_calculation_node_io(attach_outputs=False) - wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) - wc_output_triples = wc_node.base.links.get_outgoing().all() - sub_wc_node = wc_output_triples[0].node - - output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() - # Sort by mtime here, not ctime, as I'm actually creating the CalculationNode first. 
- output_triples = sorted(output_triples, key=lambda link_triple: link_triple.node.mtime) - - process_dumper = ProcessDumper() - - output_paths = sorted( - [ - process_dumper._generate_child_node_label(index, output_node) - for index, output_node in enumerate(output_triples) - ] - ) - assert output_paths == ['00-sub_workflow', '01-calculation'] - - # Check with multiply_add workchain node - multiply_add_node = generate_workchain_multiply_add() - output_triples = multiply_add_node.base.links.get_outgoing().all() - # Sort by ctime here, not mtime, as I'm generating the WorkChain normally - output_triples = sorted(output_triples, key=lambda link_triple: link_triple.node.ctime) - output_paths = sorted( - [process_dumper._generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] - ) - assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] - - -def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workchain_multiply_add): - process_dumper = ProcessDumper() - cj_node = generate_calculation_node_io(attach_outputs=False) - process_dumper._dump_node_yaml(process_node=cj_node, output_path=tmp_path) - - assert (tmp_path / node_metadata_file).is_file() - - # Test with multiply_add - wc_node = generate_workchain_multiply_add() - process_dumper._dump_node_yaml(process_node=wc_node, output_path=tmp_path) - - assert (tmp_path / node_metadata_file).is_file() - - # Open the dumped YAML file and read its contents - with open(tmp_path / node_metadata_file, 'r') as dumped_file: - contents = dumped_file.read() - - # Check if contents as expected - assert 'Node data:' in contents - assert 'User data:' in contents - # Computer is None for the locally run MultiplyAdd - assert 'Computer data:' not in contents - assert 'Node attributes:' in contents - assert 'Node extras:' in contents - - process_dumper = ProcessDumper(include_attributes=False, include_extras=False) - - process_dumper._dump_node_yaml(process_node=wc_node, output_path=tmp_path) - - # Open the dumped YAML file and read its contents - with open(tmp_path / node_metadata_file, 'r') as dumped_file: - contents = dumped_file.read() - - # Check if contents as expected -> No attributes and extras - assert 'Node data:' in contents - assert 'User data:' in contents - # Computer is None for the locally run MultiplyAdd - assert 'Computer data:' not in contents - assert 'Node attributes:' not in contents - assert 'Node extras:' not in contents - - -def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add): - wc_node = generate_workchain_multiply_add() - process_dumper = ProcessDumper() - - process_dumper._generate_readme(process_node=wc_node, output_path=tmp_path) - - assert (tmp_path / 'README.md').is_file() - - with open(tmp_path / 'README.md', 'r') as dumped_file: - contents = dumped_file.read() - - assert 'This directory contains' in contents - assert '`MultiplyAddWorkChain' in contents - assert 'ArithmeticAddCalculation' in contents - # Check for outputs of `verdi process status/report/show` - assert 'Finished [0] [3:result]' in contents - assert 'Property Value' in contents - assert 'There are 1 log messages for this calculation' in contents diff --git a/tests/tools/dumping/utils.py b/tests/tools/dumping/utils.py new file mode 100644 index 0000000000..8295ae88e7 --- /dev/null +++ b/tests/tools/dumping/utils.py @@ -0,0 +1,174 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. 
# +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + +from pathlib import Path +from typing import Any + +# TODO: Possibly move this under test directory + + +def tree_to_dict(root_path: Path) -> dict[str, list[Any]]: + """ + Convert a directory tree structure into a dictionary representation. + + The representation follows this format: + - Each directory is represented as a dictionary with the directory name as key + and a list of its contents as value + - Files are represented as strings in the list + - Subdirectories are represented as dictionaries in the list + + Args: + root_path (Path): The root directory to convert + + Returns: + Dict[str, List[Any]]: Dictionary representation of the directory structure + """ + if not root_path.exists() or not root_path.is_dir(): + raise ValueError(f'The path {root_path} does not exist or is not a directory') + + # Get the directory name + dir_name = root_path.name + + # Initialize the content list for this directory + contents = [] + + # Process all entries in the directory (sorted alphabetically) + entries = sorted(root_path.iterdir(), key=lambda p: p.name) + + # First, add all files to the content list + for entry in entries: + if entry.is_file(): + contents.append(entry.name) + + # Then, recursively process all directories + for entry in entries: + if entry.is_dir(): + # Create a dictionary for this subdirectory + subdir_dict = tree_to_dict(entry) + contents.append(subdir_dict) + + # Return the directory as a dictionary with its contents + return {dir_name: contents} + + +def tree_to_dict_dirs_only(root_path: Path) -> dict[str, list[Any]]: + """ + Convert a directory tree structure into a dictionary representation, + including only directories and ignoring files. + + The representation follows this format: + - Each directory is represented as a dictionary with the directory name as key + and a list of its subdirectories as value + - Only directories are included, files are completely ignored + - Subdirectories are represented as nested dictionaries in the list + + Args: + root_path (Path): The root directory to convert + + Returns: + Dict[str, List[Any]]: Dictionary representation of the directory structure + containing only directories + """ + if not root_path.exists() or not root_path.is_dir(): + raise ValueError(f'The path {root_path} does not exist or is not a directory') + + # Get the directory name + dir_name = root_path.name + + # Initialize the content list for this directory + contents = [] + + # Get all subdirectories in the current directory (sorted alphabetically) + subdirs = sorted([entry for entry in root_path.iterdir() if entry.is_dir()], key=lambda p: p.name) + + # Recursively process all subdirectories + for subdir in subdirs: + # Create a dictionary for this subdirectory + subdir_dict = tree_to_dict_dirs_only(subdir) + contents.append(subdir_dict) + + # Return the directory as a dictionary with its contents + return {dir_name: contents} + + +def compare_tree(expected: dict, base_path: Path, relative_path: Path = Path()): + """Recursively compares an expected directory structure with an actual path. + Verifies both that all expected elements exist and that no unexpected elements exist. + + Args: + expected (dict): The expected directory structure. 
+ base_path (Path): The root directory where the actual structure is located. + relative_path (Path): The relative path inside the base directory (used internally for recursion). + """ + for dir_name, content_list in expected.items(): + dir_path = base_path / relative_path / dir_name + + assert dir_path.exists(), f'Path does not exist: {dir_path}' + assert dir_path.is_dir(), f'Path is not a directory: {dir_path}' + + # Extract all expected files and subdirectories at this level + expected_entries = set() + expected_dirs = {} + + for item in content_list: + if isinstance(item, str): # It's a file + expected_entries.add(item) + file_path = dir_path / item + assert file_path.exists(), f'Missing file: {file_path}' + assert file_path.is_file(), f'Expected a file: {file_path}' + elif isinstance(item, dict): # It's a subdirectory + # Get the subdirectory name (the first key in the dict) + subdir_name = next(iter(item)) + expected_entries.add(subdir_name) + expected_dirs[subdir_name] = item + # Recursively check the subdirectory + compare_tree(item, base_path, relative_path / dir_name) + + # Check for unexpected entries + actual_entries = set(entry.name for entry in dir_path.iterdir()) + unexpected_entries = actual_entries - expected_entries + + assert not unexpected_entries, f'Unexpected entries found in {dir_path}: {unexpected_entries}' + + +def compare_tree_dirs_only(expected: dict, base_path: Path, relative_path: Path = Path()): + """Recursively compares an expected directory structure with an actual path, + focusing only on directories and ignoring files. + + Args: + expected (dict): The expected directory structure. + base_path (Path): The root directory where the actual structure is located. + relative_path (Path): The relative path inside the base directory (used internally for recursion). 
+ """ + for dir_name, content_list in expected.items(): + dir_path = base_path / relative_path / dir_name + + assert dir_path.exists(), f'Path does not exist: {dir_path}' + assert dir_path.is_dir(), f'Path is not a directory: {dir_path}' + + # Extract all expected subdirectories at this level + expected_dirs = {} + + for item in content_list: + if isinstance(item, dict): # It's a subdirectory + # Get the subdirectory name (the first key in the dict) + subdir_name = next(iter(item)) + expected_dirs[subdir_name] = item + + # Check for unexpected directories + actual_dirs = {entry.name: entry for entry in dir_path.iterdir() if entry.is_dir()} + unexpected_dirs = set(actual_dirs.keys()) - set(expected_dirs.keys()) + + assert not unexpected_dirs, f'Unexpected directories found in {dir_path}: {unexpected_dirs}' + + assert not unexpected_dirs, f'Unexpected directories found in {dir_path}: {unexpected_dirs}' + + # Recursively check the expected subdirectories + for subdir_name, subdir_content in expected_dirs.items(): + compare_tree_dirs_only(subdir_content, base_path, relative_path / dir_name) From 4d5d2c4e09e8d381f9838493595f13a67bf00abb Mon Sep 17 00:00:00 2001 From: Daniel Hollas Date: Wed, 7 May 2025 21:08:51 +0100 Subject: [PATCH 3/5] Add defer_build to pydantic models for faster aiida.orm import time (#6867) Squashed commit at 2025-05-09 22:04 defer_build in Entity.Model class defer_build in Entity.Model pydantic class --- src/aiida/orm/entities.py | 2 +- src/aiida/orm/utils/mixins.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aiida/orm/entities.py b/src/aiida/orm/entities.py index 7d6a185247..df6d318d6a 100644 --- a/src/aiida/orm/entities.py +++ b/src/aiida/orm/entities.py @@ -181,7 +181,7 @@ class Entity(abc.ABC, Generic[BackendEntityType, CollectionType], metaclass=Enti _CLS_COLLECTION: Type[CollectionType] = Collection # type: ignore[assignment] _logger = log.AIIDA_LOGGER.getChild('orm.entities') - class Model(BaseModel): + class Model(BaseModel, defer_build=True): pk: Optional[int] = MetadataField( None, description='The primary key of the entity. Can be `None` if the entity is not yet stored.', diff --git a/src/aiida/orm/utils/mixins.py b/src/aiida/orm/utils/mixins.py index fd022de2ce..4d8379079a 100644 --- a/src/aiida/orm/utils/mixins.py +++ b/src/aiida/orm/utils/mixins.py @@ -183,7 +183,7 @@ class Sealable: SEALED_KEY = 'sealed' - class Model(pydantic.BaseModel): + class Model(pydantic.BaseModel, defer_build=True): sealed: bool = MetadataField(description='Whether the node is sealed') @classproperty From 98177756ea4281926cf30d1857e98091c15adcc5 Mon Sep 17 00:00:00 2001 From: Edan Bainglass Date: Fri, 9 May 2025 08:27:52 +0200 Subject: [PATCH 4/5] Implement a cache for QB fields construction #6869 Cherry-picked at 2025-05-09 22:06 This aims to avoid reconstruction of fields from parent models during MRO walks. 
---
 src/aiida/orm/fields.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/aiida/orm/fields.py b/src/aiida/orm/fields.py
index f0e8ca390b..35b07f601c 100644
--- a/src/aiida/orm/fields.py
+++ b/src/aiida/orm/fields.py
@@ -396,6 +396,8 @@ def _dict(self):
 class EntityFieldMeta(ABCMeta):
     """A metaclass for entity fields, which adds a `fields` class attribute."""
 
+    _model_fields_cache: t.Dict[t.Type[BaseModel], QbFields] = {}
+
     def __init__(cls, name, bases, classdict):
         super().__init__(name, bases, classdict)
 
@@ -404,12 +406,17 @@ def __init__(cls, name, bases, classdict):
         if current_fields is not None and not isinstance(current_fields, QbFields):
             raise ValueError(f"class '{cls}' already has a `fields` attribute set")
 
-        fields = {}
-
         # If the class has an attribute ``Model`` that is a subclass of :class:`pydantic.BaseModel`, parse the model
         # fields to build up the ``fields`` class attribute, which is used to allow specifying ``QueryBuilder`` filters
         # programmatically.
         if hasattr(cls, 'Model') and issubclass(cls.Model, BaseModel):
+            # If the class has a ``Model``, check if we have already parsed it and cached
+            # the result. Use the cached result if available.
+            cache = cls._model_fields_cache
+            if cls.Model in cache:
+                cls.fields = cache[cls.Model]
+                return
+
             # If the class itself directly specifies the ``Model`` attribute, check that it is valid. Here, the check
             # ``cls.__dict__`` is used instead of ``hasattr`` as the former only returns true if the class itself
             # defines the attribute and does not just inherit it from a base class. In that case, this check will
@@ -461,8 +468,8 @@ def __init__(cls, name, bases, classdict):
                     f'`class Model({", ".join(sorted(bases))}):`'
                 )
 
-            for key, field in cls.Model.model_fields.items():
-                fields[key] = add_field(
+            fields = {
+                key: add_field(
                     key,
                     alias=get_metadata(field, 'alias', None),
                     dtype=field.annotation,
@@ -470,8 +477,13 @@ def __init__(cls, name, bases, classdict):
                     is_attribute=get_metadata(field, 'is_attribute', False),
                     is_subscriptable=get_metadata(field, 'is_subscriptable', False),
                 )
+                for key, field in cls.Model.model_fields.items()
+            }
 
-        cls.fields = QbFields({key: fields[key] for key in sorted(fields)})
+            cls.fields = QbFields({key: fields[key] for key in sorted(fields)})
+            cls._model_fields_cache[cls.Model] = cls.fields
+        else:
+            cls.fields = QbFields()
 
 
 class QbFieldArguments(t.TypedDict):

From e4205e923dc10d401beca7693210457b54ec273d Mon Sep 17 00:00:00 2001
From: Alexander Goscinski
Date: Fri, 9 May 2025 14:41:06 +0200
Subject: [PATCH 5/5] Release `v2.7.0pre1`

---
 CHANGELOG.md          | 305 ++++++++++++++++++++++++++++++++++++++++++
 src/aiida/__init__.py |   2 +-
 2 files changed, 306 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04327d7105..6133cd5ec6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,310 @@
 # Changelog
 
+## v2.7.0 - 2025-05-09
+
+### Highlights
+
+#### Asynchronous SSH connection [#6626](https://github.com/aiidateam/aiida-core/pull/6626)
+
+So far, when a connection to a remote computer was required, the transport plugins blocked any further program execution until the communication finished. This had long been an untapped opportunity for speed-ups. With the new asynchronous transport plugin for SSH (`core.ssh_async`), multiple communications with the remote computer can happen concurrently.
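+
+For example, a computer can be set up to use the new plugin through the Python API (a minimal sketch; the label, hostname and paths below are placeholders):
+
+```python
+from aiida import load_profile, orm
+
+load_profile()
+
+computer = orm.Computer(
+    label='my-cluster',                # placeholder label
+    hostname='cluster.example.com',    # placeholder hostname
+    transport_type='core.ssh_async',   # the new asynchronous SSH transport plugin
+    scheduler_type='core.slurm',
+    workdir='/scratch/{username}/aiida/',
+).store()
+
+# The connection details are then configured as usual, e.g. interactively with:
+#   verdi computer configure core.ssh_async my-cluster
+```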
+
+##### When `core.ssh_async` outperforms
+In scenarios where the worker is blocked by heavy transfer tasks (uploading/downloading/copying large files), `core.ssh_async` shows significant improvement.
+
+For example, I submitted two WorkGraphs:
+1. The first handles heavy transfers:
+   - Upload 10 MB
+   - Remote copy 1 GB
+   - Retrieve 1 GB
+2. The second performs a simple shell command: `touch file`.
+
+The time taken until the submit command is processed (with one worker running):
+- **`core.ssh_async`:** **Only 4 seconds!** 🚀🚀🚀🚀 *A major improvement!*
+- **`core.ssh`:** **108 seconds** (WorkGraph 1 fully completes before processing the second).
+
+##### When `core.ssh_async` and `core.ssh` are comparable
+For tasks involving both (and many!) uploads and downloads (a common scenario), performance varies slightly depending on the case.
+
+- **Large files (~1 GB):**
+  - `core.ssh_async` performs better due to simultaneous uploads and downloads. In some networks, this can almost double the bandwidth, as demonstrated in the graph on PR #6626. My bandwidth is 11.8 MB/s but increased to nearly double under favorable conditions. However, under heavy network load, bandwidth may revert to its base level (e.g., 11.8 MB/s).
+
+  **Test case:** Two WorkGraphs: one uploads 1 GB, the other retrieves 1 GB using `RemoteData`.
+  - `core.ssh_async`: **120 seconds**
+  - `core.ssh`: **204 seconds**
+
+- **Small files (many small transfers):**
+  - **Test case:** 25 WorkGraphs, each transferring a few 1 MB files.
+  - `core.ssh_async`: **105 seconds**
+  - `core.ssh`: **65 seconds**
+
+In this scenario, however, the overhead of asynchronous calls seems to outweigh the benefits. We need to discuss the trade-offs and explore possible optimizations. As @agoscinski mentioned, this might be expected, see [async overheads](https://stackoverflow.com/questions/55761652/what-is-the-overhead-of-an-asyncio-task).
+
+#### Serialization of ORM nodes [#6723](https://github.com/aiidateam/aiida-core/pull/6723)
+AiiDA's Python API provides an object relational mapper (ORM) that abstracts the various entities that can be stored inside the provenance graph and the relationships between them. In most use cases, users use this ORM directly in Python to construct new instances of entities and retrieve existing ones, in order to get access to their data and manipulate it. A current shortcoming of the ORM is that it is not possible to programmatically introspect the schema of each entity: that is to say, what data each entity stores. This makes it difficult for external applications to provide interfaces to create and/or retrieve entity instances. It also makes it difficult to take the data outside of the Python environment, since the data would have to be serialized. However, without a well-defined schema, doing this without an ad-hoc solution is practically impossible.
+
+With the implementation of a `Model` for each `Entity`, we now allow an external application to programmatically determine the schema of all entities of AiiDA's ORM and automatically (de)serialize entity instances to and from other data formats, e.g., JSON. An example of how this is done for an integer node is shown below.
+
+```python
+node = Int(5)  # Can be any ORM node
+serialized_node = node.serialize()
+print(serialized_node)
+# {'pk': None, 'uuid': '485c2ec8-441d-484d-b7d9-374a3cdd98ae', 'node_type': 'data.core.int.Int.', 'process_type': None, 'repository_metadata': {}, 'ctime': datetime.datetime(2025, 5, 2, 10, 20, 41, 275443, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200), 'CEST')), 'mtime': None, 'label': '', 'description': '', 'attributes': {'value': 5}, 'extras': {}, 'computer': None, 'user': 1, 'repository_content': {}, 'source': None, 'value': 5}
+# Warning: Serialization through pydantic is still an experimental feature and might break in future releases.
+# uuid: 77e9c19a-5ecb-40cf-8238-ea5c55fbb83f (unstored) value: 5
+
+node_deserialized = Int.from_serialized(**serialized_node)
+print(node_deserialized)
+# uuid: 77e9c19a-5ecb-40cf-8238-ea5c55fbb83f (unstored) value: 5
+```
+
+Note that this is still not a stable feature and might break in future releases without deprecation warning. For more information, see [AEP 010](https://github.com/aiidateam/AEP/blob/983a645c9285ba65c7cf07fe6064c23e7e994c06/010_orm_schema/readme.md).
+
+#### Stashing [#6746](https://github.com/aiidateam/aiida-core/pull/6746), [#6772](https://github.com/aiidateam/aiida-core/pull/6772)
+
+You can now bundle your data into a tar archive (with optional compression) during stashing by specifying one of the `stash_mode` options `"tar"`, `"tar.bz2"`, `"tar.gz"` or `"tar.xz"`.
+
+```python
+from aiida.plugins import CalculationFactory
+from aiida.engine import run
+from aiida.common import StashMode
+from aiida.orm import Computer
+
+inputs = {
+    ...,
+    'metadata': {
+        'computer': Computer.collection.get(label="localhost"),
+        'options': {
+            'resources': {'num_machines': 1},
+            'stash': {
+                'stash_mode': StashMode.COMPRESS_TARGZ.value,
+                'target_base': '/scratch/',
+                'source_list': ['heavy_data.xyz'],  # ['*'] to stash everything
+            },
+        },
+    },
+}
+# If you use a builder, use
+# builder.metadata = {'options': {...}, ...}
+
+run(MyCalculation, **inputs)
+```
+
+Historically, stashing was only possible if it was instructed before running a generic calcjob: the instruction had to be "attached" to the original calcjob. If a user realized only after running a calcjob that something needed to be stashed, this was not possible.
+
+We now introduce a new calcjob, `StashCalculation`, which can perform the stashing operation after a calculation has finished. The usage is very similar and, for consistency and user-friendliness, the stashing instructions are kept as part of the metadata. The main additional input is the `source_node`, i.e. the `RemoteData` node of the calculation to be stashed, for example:
+
+```python
+from aiida.plugins import CalculationFactory
+from aiida.engine import run
+from aiida.common import StashMode
+from aiida.orm import load_node
+
+StashCalculation = CalculationFactory('core.stash')
+
+calcjob_node = load_node()
+inputs = {
+    'metadata': {
+        'computer': calcjob_node.computer,
+        'options': {
+            'resources': {'num_machines': 1},
+            'stash': {
+                'stash_mode': StashMode.COPY.value,
+                'target_base': '/scratch/',
+                'source_list': ['heavy_data.xyz'],
+            },
+        },
+    },
+    'source_node': calcjob_node.outputs.remote_folder,
+}
+
+result = run(StashCalculation, **inputs)
+```
+
+#### Forcefully killing processes [#6793](https://github.com/aiidateam/aiida-core/pull/6793)
+
+Until this release, `verdi process kill` could get stuck when no connection to the remote computer could be established. There is now the `--force-kill` option to kill a process without waiting for a response from the remote computer. Note that this might create orphan jobs on the remote computer.
+
+```
+verdi process kill --force-kill
+```
+
+We also now cancel the old kill command if it is resent by the user. This allows the user to adapt the EBM parameters in the verdi config and then resend the kill command with the new parameters.
+```
+verdi process kill --timeout 5
+verdi config set transport.task_maximum_attempts 1
+verdi config set transport.task_retry_initial_interval 5
+verdi daemon restart
+verdi process kill --wait
+```
+
+For more information, see issue #6524.
+
+#### Extended dumping support for profiles and groups [#6723](https://github.com/aiidateam/aiida-core/pull/6723)
+
+We extended the dumping feature to profiles and groups, which allows the user to retrieve all data associated with a profile. It is accessible through verdi via:
+
+```bash
+verdi {profile|group|process} dump
+```
+
+Note that this is still not a stable feature and might break in future releases without deprecation warning.
+
+
+#### Miscellaneous
+- aiida-core is compatible with Python 3.13 [#6600](https://github.com/aiidateam/aiida-core/pull/6600)
+- Improved Windows support [#6715](https://github.com/aiidateam/aiida-core/pull/6715)
+- RemoteData get_size_on_disk [#6584](https://github.com/aiidateam/aiida-core/pull/6584)
+- SinglefileData from_bytes [#6653](https://github.com/aiidateam/aiida-core/pull/6653)
+- Allow memory specification [#6605](https://github.com/aiidateam/aiida-core/pull/6605)
+- Add filters to verdi group delete [#6556](https://github.com/aiidateam/aiida-core/pull/6556)
+- verdi storage maintain: show a progress bar [#6562](https://github.com/aiidateam/aiida-core/pull/6562)
+- New transport endpoints `compress` & `extract` [#6743](https://github.com/aiidateam/aiida-core/pull/6743)
+- Implementation of missing sqlite endpoints:
+  - `get_creation_statistics` [#6763](https://github.com/aiidateam/aiida-core/pull/6763)
+  - `contains` [#6619](https://github.com/aiidateam/aiida-core/pull/6619)
+  - `has_key` [#6606](https://github.com/aiidateam/aiida-core/pull/6606)
+
+
+### For users
+
+#### Features
+- CLI: Accept mulitple node identifiers in `verdi node graph generate` (#6443) [[6d2edc919]](https://github.com/aiidateam/aiida-core/commit/6d2edc919e3340b67d8097c425a5e5f6971707f8)
+- CLI: Add default for `output_file` in computer and code export commands (#6486) [[9355a9878]](https://github.com/aiidateam/aiida-core/commit/9355a9878134b7c8e3e75bb029c251f0bf0a7357)
+- CLI: Validate storage in `verdi storage version` (#6551) [[ad1a431f3]](https://github.com/aiidateam/aiida-core/commit/ad1a431f33a6e57d8b6867447ecdfd8ff41bc8f5)
+- CLI: Add filters to verdi group delete.
(#6556) [[72a6b183b]](https://github.com/aiidateam/aiida-core/commit/72a6b183b8048d5c31b2b827efe6b8b969038e28) +- CLI: verdi storage maintain: show a progress bar (#6562) [[c7c289d38]](https://github.com/aiidateam/aiida-core/commit/c7c289d3892bf76894714f53f58b7ce5b0761178) +- implement has_key filter for SQLite backend [[779cc29d8]](https://github.com/aiidateam/aiida-core/commit/779cc29d8a47eddabdf9b274d7fa711220ee1aa9) +- ORM: Add `from_bytes` `classmethod` to `orm.SinglefileData` (#6653) [[0f0b88a39]](https://github.com/aiidateam/aiida-core/commit/0f0b88a39ff670fb452678c9b47ef6978e075005) +- ORM: Add `get_size_on_disk` method to `RemoteData` (#6584) [[02cbe0ceb]](https://github.com/aiidateam/aiida-core/commit/02cbe0ceb7d9f9343911592e31abe6310d2bd38b) +- QB: Re-introduction of `contains` Filter Operator for SQLite (#6619) [[aa0aa262a]](https://github.com/aiidateam/aiida-core/commit/aa0aa262ab30e40211c0bd4d2c08a7172c2fd257) +- Transport: `AsyncTransport` plugin (#6626) [[eba6954bf]](https://github.com/aiidateam/aiida-core/commit/eba6954bf97999bbfd07210b33641d3b494ce221) +- Add support for running direct jobs on bash environment on Windows [[ce9dcf421]](https://github.com/aiidateam/aiida-core/commit/ce9dcf421346ae2e6aedc0cdc2ff325d073fcf65) +- `Transport`: feat `compress` & `extract` methods (#6743) [[f4c55f5f7]](https://github.com/aiidateam/aiida-core/commit/f4c55f5f78cd7fde9a5b4a4e48cad2159fd666b2) +- `CLI`: add option `--clean-workdir` to `verdi node delete` (#6756) [[c53592850]](https://github.com/aiidateam/aiida-core/commit/c53592850d7d39a4158ac7c8f25b219c848e9a6c) +- `RemoteStashCompressedData` new data class, and it's deployment to `execmanager.py` to support compressed file formats while stashing. [[ae49af6c3]](https://github.com/aiidateam/aiida-core/commit/ae49af6c304ba155c2e4fc2ea8f69fb8cfcacb4c) +- [QueryBuilder] Implement `get_creation_statistics` for SQLite backend (#6763) [[83454c713]](https://github.com/aiidateam/aiida-core/commit/83454c713cac6e05ac6f0b1c238f576306e87191) +- Support for Python 3.13 (#6600) [[eb34b0606]](https://github.com/aiidateam/aiida-core/commit/eb34b06062f1aad1ab78b12330706d12979b73d2) +- `StashCalculation`: a new `CalcJob` plugin (#6772) [[bc253236d]](https://github.com/aiidateam/aiida-core/commit/bc253236d7ba16b988e6cf34d58287ffb9610ccb) +- ORM: Use pydantic to specify a schema for each ORM entity (#6255) [[958bfd05c]](https://github.com/aiidateam/aiida-core/commit/958bfd05cabe403cd0ccf2832478d397c177685e) +- Add force-kill option when killing a process (#6793) [[b6d0fe50d]](https://github.com/aiidateam/aiida-core/commit/b6d0fe50dd087997a77e4120095fde19db9f92df) +- Align the yes/no prompt in `verdi computer delete` with other prompts [[71fc14f3c2a501ff8d704d20df76a297edc8e8bc]](https://github.com/aiidateam/aiida-core/commit/71fc14f3c2a501ff8d704d20df76a297edc8e8bc) + +#### Fixes +- CLI: Catch `NotImplementedError` in `verdi calcjob gotocomputer` (#6525) [[120c8ac6d]](https://github.com/aiidateam/aiida-core/commit/120c8ac6dcd15cec1ff3260ab65276c60027dd5f) +- Scheduler: Allow a memory specification of zero for the SLURM plugin (#6605) [[0fa958285]](https://github.com/aiidateam/aiida-core/commit/0fa958285bb07ece05b944cbd694aed663b5193a) +- `QueryBuilder`: Fix type bugs for PostgreSQL backend (#6658) [[53be73730]](https://github.com/aiidateam/aiida-core/commit/53be73730aae0d14fef11cf497e60dcd528c5228) +- Fix verdi devel check-undesired-imports when tui extra is installed (#6693) 
[[8039ad914]](https://github.com/aiidateam/aiida-core/commit/8039ad9147d2a4bc61bc86845a1c82c2fd484eae) +- `Transport`: Bug fix in `rename` method (#6735) [[f56fcc31c]](https://github.com/aiidateam/aiida-core/commit/f56fcc31c70e3bd6a1422eb6cfee3f5fab5eaac4) +- Cover tests that may randomly fail because DB racing in xdist test (#6713) [[4d374f465]](https://github.com/aiidateam/aiida-core/commit/4d374f465808fe64d30ef89679b820169ffd3f74) +- 'Storage': `sqlite_zip` add a filter to `tar.extractall` method to be compatible with python 3.12 (#6770) [[b95fd2189]](https://github.com/aiidateam/aiida-core/commit/b95fd21897aa83c3da25e1c18381b5613904115f) +- Post release: update version number after v2.6.3 release (#6797) [[0b2222e2b]](https://github.com/aiidateam/aiida-core/commit/0b2222e2bceba6acad21840ae69cfea50a542a9d) +- 👌 Align behavior of `puttree` method for nested folders [[834b2942e]](https://github.com/aiidateam/aiida-core/commit/834b2942eebfbf56380b900f03e94d161a5fd161) +- 🐛 Fix `local_copy_list` behavior for nested target folder [[9fe8d5090]](https://github.com/aiidateam/aiida-core/commit/9fe8d5090cbe4fbfbe34558a04cca32ecb65422f) +- Fix the pydantic model for `RemoteData` (#6845) [[936185b7f]](https://github.com/aiidateam/aiida-core/commit/936185b7f3f61c40bd5f1579f7921ea788233af1) +- Update/add models for `RemoteData` and `RemoteStashCompressedData` (#6844) [[2fd4b8931d4abd7799f6c17aec21612a7f85d1b3]](https://github.com/aiidateam/aiida-core/commit/2fd4b8931d4abd7799f6c17aec21612a7f85d1b3) + +### Documentation +- Docs: Add overview of common `core` plugins (#6654) [[baf8d7c3e]](https://github.com/aiidateam/aiida-core/commit/baf8d7c3efd18c0e3337ea5a6b09631fa449220c) +- Docs: add google-site-verification meta tag (#6792) [[660fec70e]](https://github.com/aiidateam/aiida-core/commit/660fec70ef43a64be7edae3c12f0a0fd5ef84349) +- Docs: Enhancements to Installation, Introduction, and Tutorial Sections (#6806) [[ee1cc9fb8]](https://github.com/aiidateam/aiida-core/commit/ee1cc9fb820fc34f1996b6ea338014cefdc69e36) +- Docs: Fix the logic in FizzBuzz WorkChain example (#6812) [[cfd2052a4]](https://github.com/aiidateam/aiida-core/commit/cfd2052a42c977e43699f8ff429d406b15c9740a) + +### For developers + +#### Source code +- Add the `SshAutoTransport` transport plugin (#6154) [[71422eb87]](https://github.com/aiidateam/aiida-core/commit/71422eb872040a9ba23047d2ec031f6deaa6a7cc) +- Dependencies: Update requirements for `kiwipy` and `plumpy` [[d86017f42]](https://github.com/aiidateam/aiida-core/commit/d86017f42cb5359d0272694247756d547057a663) +- `Manager`: Catch `TimeoutError` when closing communicator [[e91371573]](https://github.com/aiidateam/aiida-core/commit/e91371573a84d4a68d6107f33c392b8718f2f26f) +- `SqliteDosStorage`: Make the migrator compatible with SQLite (#6429) [[6196dcd3b]](https://github.com/aiidateam/aiida-core/commit/6196dcd3b321758ae8dfb84b22a59e1c77d8e933) +- Dependencies: Update requirement to `psycopg~=3.0` (#6362) [[cba6e7c75]](https://github.com/aiidateam/aiida-core/commit/cba6e7c757ec74194afc63809b5dac72bb81a771) +- `Scheduler`: Refactor interface to make it more generic (#6043) [[954cbdd3e]](https://github.com/aiidateam/aiida-core/commit/954cbdd3ee5127d6618db9d144508505e41cffcc) +- Post release: add the `.post0` qualifier to version attribute [[16b8fe4a0]](https://github.com/aiidateam/aiida-core/commit/16b8fe4a0912ac36973fb82f14691723e72599a7) +- Post release: update version number and CHANGELOG after v2.6.2 release 
[[fb3686271]](https://github.com/aiidateam/aiida-core/commit/fb3686271fcdeb5506838a5a3069955546b05460) +- Dependencies: Update requirement `paramiko~=3.0` (#6559) [[c52ec6758]](https://github.com/aiidateam/aiida-core/commit/c52ec6758a0d5c5191e4099cabbbd1a7314284ed) +- Refactor: Add the `_prepare_yaml` method to `AbstractCode` (#6565) [[98ffc331d]](https://github.com/aiidateam/aiida-core/commit/98ffc331d154cc7717861fde6c908ec510003926) +- Dependencies: version check on `sqlite` C-language (#6567) [[d0c9572c8]](https://github.com/aiidateam/aiida-core/commit/d0c9572c83aa953f1490f35c1110f5815dc15ac3) +- Add `verdi devel launch-multiply-add` [[b7c82867b]](https://github.com/aiidateam/aiida-core/commit/b7c82867b5cca08f1ee77bda31bf11d77e96b548) +- Add `aiida_profile_clean` to `test_devel.py` [[2ed19dcd8]](https://github.com/aiidateam/aiida-core/commit/2ed19dcd824e1b9a5003dae93613a56ecca7c3a7) +- `Transport` & `Engine`: factor out `getcwd()` & `chdir()` for compatibility with upcoming async transport (#6594) [[6f5c35ed1]](https://github.com/aiidateam/aiida-core/commit/6f5c35ed16beaf585e2c3045cab5ed4fb0ef3bc1) +- Changes required make it possible to run with pytest-xdist (#6631) [[1c5f10968]](https://github.com/aiidateam/aiida-core/commit/1c5f10968f40934b81eef72ed72dc814497707a8) +- Update changelog for release v2.6.3 (#6637) [[54c4a0d06]](https://github.com/aiidateam/aiida-core/commit/54c4a0d06660f522717d87ffae1ca84211d8c79e) +- Typing: mypy fix for `orm.List` the `pop` and `index` methods (#6635) [[36eab7797]](https://github.com/aiidateam/aiida-core/commit/36eab779793a6dd0184c83f0b0af6a75a62fac82) +- Use only one global var for marking config folder tree (#6610) [[9baf3ca96]](https://github.com/aiidateam/aiida-core/commit/9baf3ca96caa5577ec8ed6cef69512f430bb5675) +- Make `load_profile` and methods in aiida.__init__ importable from aiida module (#6609) [[ec52f4ef3]](https://github.com/aiidateam/aiida-core/commit/ec52f4ef321f9ef1fa24e5a3056153ea55bce7d4) +- Adapt message arguments passing to process controller (#6668) [[80c1741ca]](https://github.com/aiidateam/aiida-core/commit/80c1741ca3df607c75254a61fd664018dc043219) +- Disable `apparent-size` in `du` command of `get_size_on_disk` (#6702) [[2da3f9600]](https://github.com/aiidateam/aiida-core/commit/2da3f9600aa69e90977370377b1bf81edd9856a1) +- Engine: Async run (#6708) [[d71ef9810]](https://github.com/aiidateam/aiida-core/commit/d71ef98100a0d0b5ff2e71d078cde4303e1cdd1b) +- ORM: Use `skip_orm` as the default implementation for `SqlaGroup.add_nodes` and `SqlaGroup.remove_nodes` (#6720) [[d2fbf214a]](https://github.com/aiidateam/aiida-core/commit/d2fbf214ad2fcfe5e39f9ebe2982f05557196397) +- A new common funcion:`assert_never` to assert certain part of the code is never reached. 
[[d7c382a24]](https://github.com/aiidateam/aiida-core/commit/d7c382a2447260320a36be4f5c6de285f9d37bdc) +- Replace usage of aiida.common.utils.strip_prefix with built-in removeprefix (#6758) [[290045b17]](https://github.com/aiidateam/aiida-core/commit/290045b171626a94f717e1e066e3cced50dab5a3) +- Revert "Add the `SshAutoTransport` transport plugin (#6154)" (#6852) [[cf2614fa2]](https://github.com/aiidateam/aiida-core/commit/cf2614fa26ce1e8b15656a659a9f3b9fcec88b55) +- Transport: Three bug fixed in `test_execmanager` and `AsyncSshTransport` (#6855) [[474e0fabcb4e2331253e10c1fbcd8ba6a323f6d2]](https://github.com/aiidateam/aiida-core/commit/474e0fabcb4e2331253e10c1fbcd8ba6a323f6d2) +- Update package metadata with suppport of py3.13 (#6863) [[e257b3cb2f0e7a088d2ac4ebfb463492beea321d]](https://github.com/aiidateam/aiida-core/commit/e257b3cb2f0e7a088d2ac4ebfb463492beea321d) + +#### Tests +- Tests: Remove test for `make_aware` using fixed date [[9fe7672f3]](https://github.com/aiidateam/aiida-core/commit/9fe7672f36c706c7eca87a4807012a1c8a5c0259) +- Devops: Change tempfile to pytest's tmp_path [[309352f8b]](https://github.com/aiidateam/aiida-core/commit/309352f8bf29e52057bd31153858eb8a3560e704) +- Devops: Determine command directory dynamically with `which` [[d3e9333f5]](https://github.com/aiidateam/aiida-core/commit/d3e9333f517ce833b8ce288652007c10e0b99f9f) +- Tests: add --db-backend pytest option (#6625) [[a863d1e88]](https://github.com/aiidateam/aiida-core/commit/a863d1e887822334df4db9120421cd38ded04d3e) +- Test refactoring: use tmp path fixture to mock remote and local for transport plugins (#6627) [[197c666d3]](https://github.com/aiidateam/aiida-core/commit/197c666d362b3a7dd03bec9ccc1e41bb44023e7c) +- Set pytest timeout to 60s for every test (#6674) [[17ab39519]](https://github.com/aiidateam/aiida-core/commit/17ab3951956c54de29d328809d54f88d89bd66ba) +- Timeout for single pytest to 240s (#6692) [[8ae66fb66]](https://github.com/aiidateam/aiida-core/commit/8ae66fb66a4fdc876b9f429a09072f41c4ed48bd) +- Clean profile in orm/test_codes.py::test_input_code [[3f5e2c132]](https://github.com/aiidateam/aiida-core/commit/3f5e2c132554e58c868660b3ee874661925366ca) +- Only emit path to daemon log path in pytest tmp folder (#6698) [[7a460c0fd]](https://github.com/aiidateam/aiida-core/commit/7a460c0fd45ffd8da790807b3ea214e6fab01c5b) + +#### Devops +- Devops: Update pre-commit dependencies (#6504) [[14bb05f4b]](https://github.com/aiidateam/aiida-core/commit/14bb05f4b4e7fbda86682ea2cf4e3881b3a3e8dc) +- Docker: Fix release tag in publish workflow (#6520) [[c740b99f2]](https://github.com/aiidateam/aiida-core/commit/c740b99f2bfe366a733f140164a21048cd51198e) +- Devops: Mark `test_leak_ssh_calcjob` as nightly (#6521) [[a5da4eda1]](https://github.com/aiidateam/aiida-core/commit/a5da4eda131f844c3639bdb01a256b9e9a7873a2) +- Devops: Add type hints to `aiida.orm.utils.remote` (#6503) [[2bdcb7f00]](https://github.com/aiidateam/aiida-core/commit/2bdcb7f00dac93b3287baef042f873cd5f6ee247) +- Docker: Make warning test insensitive to deprecation warnings (#6541) [[f1be224c4]](https://github.com/aiidateam/aiida-core/commit/f1be224c4680407984eda8652692ec0ea708a3e1) +- CI: Update ignore comment as the way that presumably updated mypy expects (#6566) [[655da5acc]](https://github.com/aiidateam/aiida-core/commit/655da5acc183ef81120f5d77f1fdc760e186c64c) +- Update the issue template with the Discourse group (#6588) [[a7d8dd04e]](https://github.com/aiidateam/aiida-core/commit/a7d8dd04ea63bb362e9414a8bd55554f30424bf1) +- Fix broken 
troubleshooting link in bug report issue template (#6589) [[f57591655]](https://github.com/aiidateam/aiida-core/commit/f57591655a8f91d1b37bd0f46e8c6dc6a9566c2c)
+- Devops: Add tox environment for pytest with presto mark [[e2699118e]](https://github.com/aiidateam/aiida-core/commit/e2699118ea880c3619f0985defec7c5ad09b926e)
+- CLI: Dump only `sealed` process nodes (#6591) [[70572380b]](https://github.com/aiidateam/aiida-core/commit/70572380bf05998f27ea54df3653ec357e44fc69)
+- CLI: Check user execute in `verdi code test` for `InstalledCode` (#6597) [[8350df0cb]](https://github.com/aiidateam/aiida-core/commit/8350df0cb0c48588899d732feb26ce7e43903173)
+- Devops: Bump `peter-evans/create-pull-request` (#6576) [[dd866ce81]](https://github.com/aiidateam/aiida-core/commit/dd866ce816e986285f2c5794f431b6e3c68a369b)
+- Bump mypy version to ~=1.13.0 (#6630) [[c93fb4f75]](https://github.com/aiidateam/aiida-core/commit/c93fb4f75554802e46fdcb7cf8caf27318ad04d0)
+- CI: remove ci-style.yml (#6638) [[451375b7c]](https://github.com/aiidateam/aiida-core/commit/451375b7cfa08c7a373654f4ac64d6e1204bd865)
+- Pre-commit: exclude mypy for all tests (#6639) [[846c11f8c]](https://github.com/aiidateam/aiida-core/commit/846c11f8c323c60a8e17eb715980e114cc9e6363)
+- Add helper script for creating patch releases from a list of commits (#6602) [[ec8a055a5]](https://github.com/aiidateam/aiida-core/commit/ec8a055a533b6422ed95b4ec30faf70efd667761)
+- CLI: Handle `None` process states in build_call_graph (#6590) [[f74adb94c]](https://github.com/aiidateam/aiida-core/commit/f74adb94cc1e8439c8076f563ec112466fdd174b)
+- Bump ruff version (#6614) [[c915a9734]](https://github.com/aiidateam/aiida-core/commit/c915a97348ffb344ab48fe031711120238f6976a)
+- DevOp: Using xdist to run pytest in parallel (#6620) [[090dc1c73]](https://github.com/aiidateam/aiida-core/commit/090dc1c7300ab9d25ee75498d040ecf1cd3bb1d7)
+- Bump codecov/codecov-action from 4 to 5 in the gha-dependencies group (#6648) [[835d13b73]](https://github.com/aiidateam/aiida-core/commit/835d13b735e068883ed414755717b0fc366642b0)
+- Amend type call error after using AiiDAConfigDir (#6646) [[dbdc36c63]](https://github.com/aiidateam/aiida-core/commit/dbdc36c635ae3596905ab54f0905e97026b85f49)
+- CI: Turn off verbose pytest output (#6633) [[41a0fd92b]](https://github.com/aiidateam/aiida-core/commit/41a0fd92bf6233a758b10a785534f6186b567e16)
+- Bump ruff to v0.8.0 (#6634) [[333992be5]](https://github.com/aiidateam/aiida-core/commit/333992be53d2e9a54ce116470a7a5534b2a28f07)
+- CI: Utilize uv lockfile for reproducible test environments (#6640) [[04cc34488]](https://github.com/aiidateam/aiida-core/commit/04cc34488220de645289f23f126501f63beabbd8)
+- CI: Test with RabbitMQ 4.0.x (#6649) [[a1872b1e7]](https://github.com/aiidateam/aiida-core/commit/a1872b1e7f99f8673211bde10cbbc68a1d697ad5)
+- CI: quick fix on failed benchmark CI using uv run pytest (#6652) [[37e5431e6]](https://github.com/aiidateam/aiida-core/commit/37e5431e6802aae5de4a5079af2fcdc1b6a2ff66)
+- Typing-extensions as dependency for py3.9 (#6664) [[c532b34a1]](https://github.com/aiidateam/aiida-core/commit/c532b34a199f86d2e117cfbfa47affd4afe9d28b)
+- Update uv.lock + CI tweaks + fix pymatgen issue (#6676) [[10930266f]](https://github.com/aiidateam/aiida-core/commit/10930266fb6c644a7467a41268fe27f4bc40ea13)
+- Devops: Update pre-commit dependencies (#6683) [[0eb77b8ae]](https://github.com/aiidateam/aiida-core/commit/0eb77b8ae5e54716d7c9f4d22aac880939285198)
+- CI: Add matrix testing for both SQLite and PostgreSQL database backends [[a3ccec96d]](https://github.com/aiidateam/aiida-core/commit/a3ccec96d894ea7ce79dee4c14914ce4ebb869c6)
+- Fix redundant "tests/" in test-install.yml (#6695) [[5e8bbe1ae]](https://github.com/aiidateam/aiida-core/commit/5e8bbe1ae08ad5b8ae3d25a70be0a059c9ce260f)
+- CI: Drop workaround to pass pymatgen related failed tests (#6694) [[d17c2931a]](https://github.com/aiidateam/aiida-core/commit/d17c2931a96782ec89704d656dd41bcfd671410a)
+- Devops: Add explicit `sphinx.configuration` key to RTD conf (#6700) [[8440416a5]](https://github.com/aiidateam/aiida-core/commit/8440416a58d901e0030c7088c41be8ed94922594)
+- Use uv lockfile in readthedocs build (#6685) [[15b5caf23]](https://github.com/aiidateam/aiida-core/commit/15b5caf239cefce144386ec846edff0546b0637a)
+- Run mypy on src/aiida/orm/nodes/caching.py (#6703) [[199a0276c]](https://github.com/aiidateam/aiida-core/commit/199a0276c4becfa23f64014b4f426a0c317068fe)
+- CI: Set runners to ubuntu-24.04 (#6696) [[b43261143]](https://github.com/aiidateam/aiida-core/commit/b43261143a136a58afddcfa1438036c99b39f8b7)
+- CI: fix failed test_containerized.py integration test for containerized code (#6707) [[738705611]](https://github.com/aiidateam/aiida-core/commit/738705611bf18cdd2bf337531268a8ec7465322a)
+- Nightly tests adjust run test_memory_leak in test move high_link to nightly (#6701) [[b3569508c]](https://github.com/aiidateam/aiida-core/commit/b3569508c8edcc1aa2b1126481ab60d9133f0189)
+- Set minimum uv version in pyproject.toml (#6714) [[c88fc05ba]](https://github.com/aiidateam/aiida-core/commit/c88fc05ba111f82e089553e5413b2c7c4d482f6f)
+- Use uv-pre-commit to validate lockfile (#6699) [[208d6a967]](https://github.com/aiidateam/aiida-core/commit/208d6a967203dcaa4b685f336e413491cdceb7a2)
+- Use Github arm runners for docker arm build tests (#6717) [[f43a51010]](https://github.com/aiidateam/aiida-core/commit/f43a51010b3c84b1e9526d167401fc9ac07c556a)
+- CI: Ignore mamba Warning Message on stderr to Prevent JSON Parsing Errors (#6748) [[8c5c709be]](https://github.com/aiidateam/aiida-core/commit/8c5c709bece13c56af1d91384957e7ccb8dd320b)
+- Fix docker arm build (#6759) [[c4dfadabf]](https://github.com/aiidateam/aiida-core/commit/c4dfadabfa3183118bbbc416307529f50ebc9fd0)
+- `CI`: `RTD` tail errors and warnings (#6776) [[bb5f93daa]](https://github.com/aiidateam/aiida-core/commit/bb5f93daaf50ee48e18856000522c2da757b022e)
+- Bump the gha-dependencies group with 2 updates (#6778) [[48bd2acbc]](https://github.com/aiidateam/aiida-core/commit/48bd2acbcd6f3dd1e6b4f7b46abea5451d6a96c9)
+- Revert "Bump the gha-dependencies group with 2 updates (#6778)" (#6838) [[61be15d54]](https://github.com/aiidateam/aiida-core/commit/61be15d548eab6001a43127eb851aa766d514067)
+- Add Python 3.13 to `tests` job in `test-install` workflow and for verdi and presto jobs in `ci-code` workflow (#6843) [[6cb2c712d]](https://github.com/aiidateam/aiida-core/commit/6cb2c712dcc62e6429f990b9e4f29d84036ca69e)
+- Add back pre-commit job for mypy type-checking (#6827) [[6eb17a7d0]](https://github.com/aiidateam/aiida-core/commit/6eb17a7d0999adb2bf7ba2449b2667049996a65b)
+- RTD: add a developer comment in `.readthedocs.yml` (#6864) [[5807a390927a5892cd349bf8e3f3da5ef233eca5]](https://github.com/aiidateam/aiida-core/commit/5807a390927a5892cd349bf8e3f3da5ef233eca5)
+
+
 ## v2.6.3 - 2024-11-6
 
 ### Fixes
diff --git a/src/aiida/__init__.py b/src/aiida/__init__.py
index 752e680ef0..62eee234d1 100644
--- a/src/aiida/__init__.py
+++ b/src/aiida/__init__.py
@@ -27,7 +27,7 @@
     'For further information please visit http://www.aiida.net/. All rights reserved.'
 )
 __license__ = 'MIT license, see LICENSE.txt file.'
-__version__ = '2.6.3.post0'
+__version__ = '2.7.0pre1'
 __authors__ = 'The AiiDA team.'
 __paper__ = (
     'S. P. Huber et al., "AiiDA 1.0, a scalable computational infrastructure for automated reproducible workflows and '