Added entry point to request for align-and-merge job for CLEM workflow and register its results (#411)

tieneupin · web-flow · commit 67854bb35ecb · 2024-11-29T17:24:05.000Z
* Added entry point to request for align-and-merge processing job for CLEM workflow
* Added entry point to register CLEM align-and-merge results
* Moved CLEM database functions to murfey.workflows.clem.__init__
* Standardised CLEM workflow names
diff --git a/pyproject.toml b/pyproject.toml
@@ -98,10 +98,12 @@ murfey = "murfey.client:run"
 [project.entry-points."murfey.config.extraction"]
 "murfey_machine" = "murfey.util.config:get_extended_machine_config"
 [project.entry-points."murfey.workflows"]
-"process_raw_lifs" = "murfey.workflows.clem.process_raw_lifs:zocalo_cluster_request"
-"process_raw_tiffs" = "murfey.workflows.clem.process_raw_tiffs:zocalo_cluster_request"
-"register_lif_preprocessing_result" = "murfey.workflows.clem.register_preprocessing_results:register_lif_preprocessing_result"
-"register_tiff_preprocessing_result" = "murfey.workflows.clem.register_preprocessing_results:register_tiff_preprocessing_result"
+"clem.align_and_merge" = "murfey.workflows.clem.align_and_merge:submit_cluster_request"
+"clem.process_raw_lifs" = "murfey.workflows.clem.process_raw_lifs:zocalo_cluster_request"
+"clem.process_raw_tiffs" = "murfey.workflows.clem.process_raw_tiffs:zocalo_cluster_request"
+"clem.register_align_and_merge_result" = "murfey.workflows.clem.register_align_and_merge_results:register_align_and_merge_result"
+"clem.register_lif_preprocessing_result" = "murfey.workflows.clem.register_preprocessing_results:register_lif_preprocessing_result"
+"clem.register_tiff_preprocessing_result" = "murfey.workflows.clem.register_preprocessing_results:register_tiff_preprocessing_result"
 
 [tool.setuptools]
 package-dir = {"" = "src"}
diff --git a/src/murfey/server/api/clem.py b/src/murfey/server/api/clem.py
@@ -629,7 +629,9 @@ def process_raw_lifs(
     try:
         # Try and load relevant Murfey workflow
         workflow: EntryPoint = list(
-            entry_points().select(group="murfey.workflows", name="process_raw_lifs")
+            entry_points().select(
+                group="murfey.workflows", name="clem.process_raw_lifs"
+            )
         )[0]
     except IndexError:
         raise RuntimeError("The relevant Murfey workflow was not found")
@@ -661,7 +663,9 @@ def process_raw_tiffs(
     try:
         # Try and load relevant Murfey workflow
         workflow: EntryPoint = list(
-            entry_points().select(group="murfey.workflows", name="process_raw_tiffs")
+            entry_points().select(
+                group="murfey.workflows", name="clem.process_raw_tiffs"
+            )
         )[0]
     except IndexError:
         raise RuntimeError("The relevant Murfey workflow was not found")
diff --git a/src/murfey/util/models.py b/src/murfey/util/models.py
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
-from ast import literal_eval
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, validator
+from pydantic import BaseModel
 
 """
 General Models
@@ -161,40 +160,6 @@ class TIFFSeriesInfo(BaseModel):
     series_metadata: Path
 
 
-class LIFPreprocessingResult(BaseModel):
-    image_stack: Path
-    metadata: Path
-    series_name: str
-    channel: str
-    number_of_members: int
-    parent_lif: Path
-
-
-class TIFFPreprocessingResult(BaseModel):
-    image_stack: Path
-    metadata: Path
-    series_name: str
-    channel: str
-    number_of_members: int
-    parent_tiffs: list[Path]
-
-    @validator(
-        "parent_tiffs",
-        pre=True,
-    )
-    def parse_stringified_list(cls, value):
-        if isinstance(value, str):
-            try:
-                eval_result = literal_eval(value)
-                if isinstance(eval_result, list):
-                    parent_tiffs = [Path(p) for p in eval_result]
-                    return parent_tiffs
-            except (SyntaxError, ValueError):
-                raise ValueError("Unable to parse input")
-        # Return value as-is; if it fails, it fails
-        return value
-
-
 """
 FIB
 ===
diff --git a/src/murfey/workflows/clem/__init__.py b/src/murfey/workflows/clem/__init__.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Optional, Type, Union
+
+from sqlalchemy.exc import NoResultFound
+from sqlmodel import Session, select
+
+from murfey.util.config import get_machine_config
+from murfey.util.db import (
+    CLEMImageMetadata,
+    CLEMImageSeries,
+    CLEMImageStack,
+    CLEMLIFFile,
+    CLEMTIFFFile,
+)
+from murfey.util.db import Session as MurfeySession
+
+logger = logging.getLogger("murfey.workflows.clem")
+
+
+"""
+HELPER FUNCTIONS FOR CLEM DATABASE
+"""
+
+
+def _validate_and_sanitise(
+    file: Path,
+    session_id: int,
+    db: Session,
+) -> Path:
+    """
+    Performs validation and sanitisation on the incoming file paths, ensuring that
+    no forbidden characters are present and that the path points only to allowed
+    sections of the file server.
+
+    Returns the fully resolved file path as a Path object once all of the checks
+    have passed.
+
+    NOTE: Due to the instrument name query, 'db' now needs to be passed as an
+    explicit variable to this function from within a FastAPI endpoint, as using the
+    instance that was imported directly won't load it in the correct state.
+    """
+
+    valid_file_types = (
+        ".lif",
+        ".tif",
+        ".tiff",
+        ".xlif",
+        ".xml",
+    )
+
+    # Resolve symlinks and directory changes to get full file path
+    full_path = Path(file).resolve()
+
+    # Use the machine configuration to work out which file base paths are accepted
+    instrument_name = (
+        db.exec(select(MurfeySession).where(MurfeySession.id == session_id))
+        .one()
+        .instrument_name
+    )
+    machine_config = get_machine_config(instrument_name=instrument_name)[
+        instrument_name
+    ]
+    rsync_basepath = machine_config.rsync_basepath
+    try:
+        base_path = list(rsync_basepath.parents)[-2].as_posix()
+    except IndexError:
+        logger.warning(f"Base path {rsync_basepath!r} is too short")
+        base_path = rsync_basepath.as_posix()
+    except Exception as e:
+        raise Exception(
+            f"Unexpected exception encountered when loading the file base path: {e}"
+        )
+
+    # Check that full file path doesn't contain unallowed characters
+    # Currently allows only:
+    # - words (alphanumerics and "_"; \w),
+    # - spaces (\s),
+    # - periods,
+    # - dashes,
+    # - forward slashes ("/")
+    if bool(re.fullmatch(r"^[\w\s\.\-/]+$", str(full_path))) is False:
+        raise ValueError(f"Unallowed characters present in {file}")
+
+    # Check that it's not accessing somewhere it's not allowed
+    if not str(full_path).startswith(str(base_path)):
+        raise ValueError(f"{file} points to a directory that is not permitted")
+
+    # Check that it's a file, not a directory
+    if full_path.is_file() is False:
+        raise ValueError(f"{file} is not a file")
+
+    # Check that it is of a permitted file type
+    if f"{full_path.suffix}" not in valid_file_types:
+        raise ValueError(f"{full_path.suffix} is not a permitted file format")
+
+    return full_path
+
+
+def get_db_entry(
+    db: Session,
+    # With the database search function having been moved out of the FastAPI
+    # endpoint, the database now has to be explicitly passed within the FastAPI
+    # endpoint function in order for it to be loaded in the correct state.
+    table: Type[
+        Union[
+            CLEMImageMetadata,
+            CLEMImageSeries,
+            CLEMImageStack,
+            CLEMLIFFile,
+            CLEMTIFFFile,
+        ]
+    ],
+    session_id: int,
+    file_path: Optional[Path] = None,
+    series_name: Optional[str] = None,
+) -> Union[
+    CLEMImageMetadata,
+    CLEMImageSeries,
+    CLEMImageStack,
+    CLEMLIFFile,
+    CLEMTIFFFile,
+]:
+    """
+    Searches the CLEM workflow-related tables in the Murfey database for an entry that
+    matches the file path or series name within a given session. Returns the entry if
+    a match is found, otherwise registers it as a new entry in the database.
+    """
+
+    # Validate that parameters are provided correctly
+    if file_path is None and series_name is None:
+        raise ValueError(
+            "One of either 'file_path' or 'series_name' has to be provided"
+        )
+    if file_path is not None and series_name is not None:
+        raise ValueError("Only one of 'file_path' or 'series_name' should be provided")
+
+    # Validate file path if provided
+    if file_path is not None:
+        try:
+            file_path = _validate_and_sanitise(file_path, session_id, db)
+        except Exception:
+            raise Exception
+
+    # Validate series name to use
+    if series_name is not None:
+        if bool(re.fullmatch(r"^[\w\s\.\-/]+$", series_name)) is False:
+            raise ValueError("One or more characters in the string are not permitted")
+
+    # Return database entry if it exists
+    try:
+        db_entry = (
+            db.exec(
+                select(table)
+                .where(table.session_id == session_id)
+                .where(table.file_path == str(file_path))
+            ).one()
+            if file_path is not None
+            else db.exec(
+                select(table)
+                .where(table.session_id == session_id)
+                .where(table.series_name == series_name)
+            ).one()
+        )
+    # Create and register new entry if not present
+    except NoResultFound:
+        db_entry = (
+            table(
+                file_path=str(file_path),
+                session_id=session_id,
+            )
+            if file_path is not None
+            else table(
+                series_name=series_name,
+                session_id=session_id,
+            )
+        )
+        db.add(db_entry)
+        db.commit()
+        db.refresh(db_entry)
+    except Exception:
+        raise Exception
+
+    return db_entry
diff --git a/src/murfey/workflows/clem/align_and_merge.py b/src/murfey/workflows/clem/align_and_merge.py
@@ -0,0 +1,79 @@
+"""
+Script to allow Murfey to request an image alignment, colorisation, and merge job
+from cryoemservices.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Literal, Optional
+
+from murfey.util.config import get_machine_config
+
+try:
+    from murfey.server.ispyb import TransportManager  # Session
+except AttributeError:
+    pass  # Ignore if ISPyB credentials environment variable not set
+
+
+def submit_cluster_request(
+    # Session parameters
+    session_id: int,
+    instrument_name: str,
+    # Processing parameters
+    series_name: str,
+    images: list[Path],
+    metadata: Path,
+    # Optional processing parameters
+    align_self: Optional[str] = None,
+    flatten: Optional[Literal["min", "max", "mean"]] = "mean",
+    align_across: Optional[str] = None,
+    # Optional session parameters
+    messenger: Optional[TransportManager] = None,
+):
+    if not messenger:
+        raise Exception("Unable to find transport manager")
+
+    # Load feedback queue
+    machine_config = get_machine_config()[instrument_name]
+    feedback_queue: str = machine_config.feedback_queue
+
+    # Work out session directory from file path
+    processed_folder = machine_config.processed_directory_name
+    if not images:
+        raise ValueError(f"No image files have been provided for {series_name!r}")
+    reference_file = images[0]
+    path_parts = list(reference_file.parts)
+    path_parts[0] = "" if path_parts[0] == "/" else path_parts[0]
+    try:
+        root_index = path_parts.index(processed_folder)
+    except ValueError:
+        raise ValueError(
+            f"The processed directory {processed_folder!r} could not be found in the "
+            f"file path for {str(reference_file)!r}"
+        )
+    session_dir = Path("/".join(path_parts[:root_index]))
+
+    # Submit message to cryoemservices
+    messenger.send(
+        "processing_recipe",
+        {
+            "recipes": ["clem-align-and-merge"],
+            "parameters": {
+                # Job parameters
+                "series_name": series_name,
+                "images": [str(file) for file in images],
+                "metadata": str(metadata),
+                "align_self": align_self,
+                "flatten": flatten,
+                "align_across": align_across,
+                # Other recipe parameters
+                "session_dir": str(session_dir),
+                "session_id": session_id,
+                "job_name": series_name,
+                "feedback_queue": feedback_queue,
+            },
+        },
+        new_connection=True,
+    )
+    return True
diff --git a/src/murfey/workflows/clem/register_align_and_merge_results.py b/src/murfey/workflows/clem/register_align_and_merge_results.py
diff --git a/src/murfey/workflows/clem/register_preprocessing_results.py b/src/murfey/workflows/clem/register_preprocessing_results.py