Commit ee41e05

cosmx loader and mouse brain dataset (#63)
1 parent aa69c91 commit ee41e05

File tree

5 files changed: +512, -0 lines changed
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
#!/bin/bash

# Get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# Ensure that the commands below are run from the root of the repository
cd "$REPO_ROOT"

set -e

publish_dir="s3://openproblems-data/resources/datasets"

cat > /tmp/params.yaml << HERE
param_list:
  - id: "bruker_cosmx/bruker_mouse_brain_cosmx/rep1"
    input_raw: "https://smi-public.objects.liquidweb.services/HalfBrain.zip"
    input_flat_files: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
    dataset_name: "Bruker CosMx Mouse Brain"
    dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/cosmx-smi-mouse-brain-ffpe-dataset/"
    dataset_summary: "Bruker CosMx Mouse Brain dataset on FFPE covering a full hemisphere of a mouse brain."
    dataset_description: "Bruker CosMx Mouse Brain dataset on FFPE covering a full hemisphere of a mouse brain."
    dataset_organism: "mus_musculus"
    segmentation_id: ["cell"]

output_dataset: "\$id/dataset.zarr"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_bruker_cosmx/main.nf \
  --workspace 53907369739130 \
  --params-file /tmp/params.yaml \
  --config common/nextflow_helpers/labels_tw.config \
  --labels datasets,bruker_cosmx
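Note the two kinds of `$` in the heredoc above: `$publish_dir` is expanded by bash when the file is written, while the escaped `\$id` reaches `/tmp/params.yaml` as a literal `$id` placeholder that Nextflow later resolves per dataset. A minimal sketch of the rendered result (illustrative values only, not the full file):

```python
# Sketch of what the heredoc writes to /tmp/params.yaml: bash substitutes
# $publish_dir at write time, while the escaped \$id survives as a literal
# "$id" for Nextflow to resolve.
publish_dir = "s3://openproblems-data/resources/datasets"
rendered = (
    'output_dataset: "$id/dataset.zarr"\n'   # "\$id" in the script -> literal $id
    'output_state: "$id/state.yaml"\n'
    f'publish_dir: "{publish_dir}"\n'        # "$publish_dir" -> expanded by bash
)
assert "$id/dataset.zarr" in rendered   # placeholder kept for Nextflow
assert publish_dir in rendered          # value expanded at write time
```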
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
name: bruker_cosmx
namespace: datasets/loaders

argument_groups:
  - name: Inputs
    arguments:
      - type: file
        name: --input_raw
        example: "https://smi-public.objects.liquidweb.services/HalfBrain.zip"
        description: "Download file url for the raw data"
      - type: file
        name: --input_flat_files
        example: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
        description: "Download file url for the flat files"
      - type: string
        name: --segmentation_id
        default: ["cell"]
        description: The segmentation identifier
        multiple: true
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: true
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_ist.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1
    __merge__:
      - /src/base/setup_spatialdata_partial.yaml
    setup:
      - type: python
        pypi:
          - sopa
    # NOTE: Changed to sopa releases since the work of https://github.com/gustaveroussy/sopa/issues/285 is merged and released
    #  - type: python
    #    github: [gustaveroussy/sopa@cosmx_labels]
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highmem, midcpu, midtime]
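As a sketch of what this config means for `script.py`: viash injects the parsed arguments as a `par` dict, and, assuming viash's usual handling of `multiple: true`, `--segmentation_id` arrives as a Python list (defaulting to `["cell"]`). Illustrative values only:

```python
# Hypothetical `par` dict as viash would inject it into script.py for this
# config; `multiple: true` string arguments arrive as lists.
par = {
    "input_raw": "https://smi-public.objects.liquidweb.services/HalfBrain.zip",
    "segmentation_id": ["cell"],   # multiple: true -> list, default ["cell"]
    "dataset_id": "bruker_cosmx/bruker_mouse_brain_cosmx/rep1",
    "output": "output.zarr",
}
# The loader script's guard on segmentation_id then holds:
assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1)
```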
Lines changed: 251 additions & 0 deletions
@@ -0,0 +1,251 @@
1+
2+
3+
"""
4+
Notes about supported data structure:
5+
6+
The directory structure that we need, looks as follows (expected by the sopa.io.cosmx function):
7+
├── CellStatsDir ( = `DATA_DIR`)
8+
│ ├── CellLabels
9+
│ │ ├── CellLabels_F001.tif
10+
│ │ ├── ...
11+
│ │ └── CellLabels_F<last_fov_id>.tif
12+
│ ├── Morphology2D
13+
│ │ ├── <some_id>_F001.tif
14+
│ │ ├── ...
15+
│ │ └── <some_id>_F<last_fov_id>.tif
16+
│ ├── <dataset_id>_exprMat_file.csv
17+
│ ├── <dataset_id>_fov_positions_file.csv
18+
│ ├── <dataset_id>_metadata_file.csv
19+
│ ├── <dataset_id>_tx_file.csv
20+
│ └── <dataset_id>-polygons.csv
21+
├── (AnalysisResults)
22+
└── (RunSummary)
23+
24+
The "flat files" are the csv files that start with `<dataset_id>_`. They are generated by the AtoMx software.
25+
26+
27+
For different datasets we face slightly different data structures, the inputs to the component need to be chosen accordingly.
28+
If the raw files and the flat files are in different zip files, we need to set par["input_flat_files"]. The other structural
29+
differences are handled by the component.
30+
31+
### Version 1 (Example: CosMx Liver and Liver cancer) ###
32+
├── CellStatsDir
33+
│ ├── FOV001
34+
│ │ └── CellLabels_F001.tif
35+
│ ├── ...
36+
│ ├── FOV130
37+
│ │ └── CellLabels_F130.tif
38+
│ ├── Morphology2D
39+
│ │ └── ...
40+
│ ├── <dataset_id>_exprMat_file.csv
41+
│ ├── <dataset_id>_fov_positions_file.csv
42+
│ ├── <dataset_id>_metadata_file.csv
43+
│ ├── <dataset_id>_tx_file.csv
44+
│ └── <dataset_id>-polygons.csv
45+
├── (AnalysisResults)
46+
└── (RunSummary)
47+
--> the CellLabels folder is not present, but the CellLabels_FXXX.tif files are in the FOV folders.
48+
They are moved to a newly created CellLabels folder.
49+
50+
### Version 2 (Example: CosMx Mouse brain) ###
51+
├── CellStatsDir
52+
│ ├── FOV001
53+
│ │ └── CellLabels_F001.tif
54+
│ ├── ...
55+
│ ├── FOV130
56+
│ │ └── CellLabels_F130.tif
57+
│ ├── Morphology2D
58+
│ │ └── ...
59+
├── (AnalysisResults)
60+
└── (RunSummary)
61+
&
62+
├── <dataset_id>_exprMat_file.csv
63+
├── <dataset_id>_fov_positions_file.csv
64+
├── <dataset_id>_metadata_file.csv
65+
├── <dataset_id>_tx_file.csv
66+
└── <dataset_id>-polygons.csv
67+
--> the flat files are in a separate zip, they need to be moved to CellStatsDir ( = `DATA_DIR`)
68+
--> as in version 1, the CellLabels folder is not present, but the CellLabels_FXXX.tif files are in the FOV folders.
69+
70+
71+
72+
### Version 3 (Example: CosMx lung cancer) ###
73+
this has subdirectories for each sample and an extra sub directories for morphology images. Also,
74+
the images are given for each z plane. This dataset is covered with the bruker_cosmx_nsclc dataloader.
75+
76+
77+
78+
79+
80+
"""
81+
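The Version 1/2 normalization described above (collecting `CellLabels_FXXX.tif` from the `FOV*` folders into a `CellLabels` folder, as the script does further down) can be sketched on a throwaway directory; all paths here are hypothetical:

```python
# Sketch (hypothetical paths): normalize a "Version 1/2" layout into the layout
# sopa.io.cosmx expects, by copying CellLabels_FXXX.tif out of the FOV folders.
import shutil
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp()) / "CellStatsDir"
for i in (1, 2):                                 # build a fake Version-1 tree
    fov = root / f"FOV{i:03d}"
    fov.mkdir(parents=True)
    (fov / f"CellLabels_F{i:03d}.tif").touch()

labels_dir = root / "CellLabels"
labels_dir.mkdir()
for fov_dir in sorted(root.glob("FOV*")):
    fov_id = fov_dir.name[-3:]                   # "001", "002", ...
    shutil.copy(fov_dir / f"CellLabels_F{fov_id}.tif", labels_dir)

assert sorted(p.name for p in labels_dir.iterdir()) == [
    "CellLabels_F001.tif", "CellLabels_F002.tif"
]
```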
import os
import shutil
import zipfile
from pathlib import Path
from datetime import datetime
import sopa

## VIASH START
par = {
    "input_raw": "https://smi-public.objects.liquidweb.services/HalfBrain.zip",
    "input_flat_files": "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip",
    "segmentation_id": ["cell"],
    "output": "output.zarr",
    "dataset_id": "bruker_cosmx/bruker_mouse_brain_cosmx/rep1",
    "dataset_name": "value",
    "dataset_url": "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/cosmx-smi-mouse-brain-ffpe-dataset/",
    "dataset_reference": "value",
    "dataset_summary": "value",
    "dataset_description": "value",
    "dataset_organism": "human",
}
meta = {
    # "temp_dir": "./temp/datasets/bruker_cosmx",
    "temp_dir": "/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/test_folder"
}
## VIASH END
assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1), \
    "Currently only the 'cell' segmentation is supported by this script, and CosMx does not provide other segmentations."

t0 = datetime.now()
# Define temp dir and file names
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
FILE_NAME_RAW = TMP_DIR / par["input_raw"].split("/")[-1]
DATA_DIR = FILE_NAME_RAW.parent / FILE_NAME_RAW.stem / "CellStatsDir"

if par["input_flat_files"] is not None:
    FILE_NAME_FLAT = TMP_DIR / par["input_flat_files"].split("/")[-1].replace("%20", " ")
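The `.replace("%20", " ")` only handles percent-encoded spaces; `urllib.parse.unquote` decodes any percent-escape and gives the same result for the flat-files URL used here:

```python
# Sketch: urllib.parse.unquote decodes all percent-escapes, not just %20.
from urllib.parse import unquote

url = "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
file_name = unquote(url.split("/")[-1])
assert file_name == "Half  Brain simple  files .zip"
# For this URL it is equivalent to the simple replace used above:
assert file_name == url.split("/")[-1].replace("%20", " ")
```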
# Download raw files
print(datetime.now() - t0, "Download raw files", flush=True)
os.system(f"wget {par['input_raw']} -O '{FILE_NAME_RAW}'")

# Extract zip of raw files
print(datetime.now() - t0, "Extract zip of raw files", flush=True)
with zipfile.ZipFile(FILE_NAME_RAW, 'r') as zip_ref:
    zip_ref.extractall(TMP_DIR)

# Download and extract flat files if they are not already present
FLAT_FILES_ENDINGS = ["_exprMat_file.csv", "_fov_positions_file.csv", "_metadata_file.csv", "_tx_file.csv"]  # , "polygons.csv"]
flat_files_count = 0
for ending in FLAT_FILES_ENDINGS:
    if any(f.endswith(ending) for f in os.listdir(DATA_DIR)):
        print(f"Flat file with ending {ending} already present in extracted raw files", flush=True)
        flat_files_count += 1

if flat_files_count == len(FLAT_FILES_ENDINGS):
    print(datetime.now() - t0, "All flat files already present in extracted raw files", flush=True)
else:
    print(datetime.now() - t0, "Download and extract flat files", flush=True)
    os.system(f"wget {par['input_flat_files']} -O '{FILE_NAME_FLAT}'")

    with zipfile.ZipFile(FILE_NAME_FLAT, 'r') as zip_ref:
        zip_ref.extractall(TMP_DIR)

    print(datetime.now() - t0, f"Move flat files to {DATA_DIR}", flush=True)
    source_dir = FILE_NAME_FLAT.parent / FILE_NAME_FLAT.stem

    file_names = os.listdir(source_dir)
    for file_name in file_names:
        if not (DATA_DIR / file_name).exists():
            shutil.move(source_dir / file_name, DATA_DIR)
        else:
            print(datetime.now() - t0, f"File {file_name} already present in {DATA_DIR}", flush=True)
# Copy CellLabels_FXXX.tif files to the CellLabels folder if they are not already there
labels_dir = DATA_DIR / "CellLabels"

if not labels_dir.exists():
    print(datetime.now() - t0, "Create CellLabels folder with CellLabels tifs", flush=True)
    # Create a CellLabels folder with the CellLabels tifs (this folder name is expected but not always present),
    # see e.g. the late discussion in https://github.com/gustaveroussy/sopa/issues/285

    labels_dir.mkdir(parents=True, exist_ok=True)

    # Get all folders in DATA_DIR that start with "FOV" and copy each CellLabels_FXXX.tif file to the CellLabels folder
    print(datetime.now() - t0, "Copy CellLabels_FXXX.tif files to CellLabels folder", flush=True)
    for fov_dir in DATA_DIR.glob("FOV*"):
        fov_id = str(fov_dir)[-3:]
        shutil.copy(fov_dir / f"CellLabels_F{fov_id}.tif", labels_dir / f"CellLabels_F{fov_id}.tif")
else:
    print(datetime.now() - t0, "CellLabels folder already present", flush=True)
#########################################
# Convert raw files to spatialdata zarr #
#########################################

# from pathlib import Path
# import sopa
# data_dir = Path("/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/HalfBrain/CellStatsDir")

print(datetime.now() - t0, "Convert raw files to spatialdata zarr", flush=True)

sdata = sopa.io.cosmx(
    DATA_DIR,
    dataset_id=None,
    fov=None,
    read_proteins=False,
    cells_labels=True,
    cells_table=True,
    cells_polygons=True,
    flip_image=False,
)
###############
# Rename keys #
###############
print(datetime.now() - t0, "Rename keys", flush=True)

elements_renaming_map = {
    "stitched_image": "morphology_mip",
    "stitched_labels": "cell_labels",
    "points": "transcripts",
    "cells_polygons": "cell_boundaries",
    "table": "metadata",
}

for old_key, new_key in elements_renaming_map.items():
    sdata[new_key] = sdata[old_key]
    del sdata[old_key]

# Rename transcript columns (overwriting the 'target' column leads to an error, so instead we add a duplicate with the right name)
# sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_cell_id": "cell_id", "target": "feature_name"})
sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_cell_id": "cell_id"})
sdata['transcripts']["feature_name"] = sdata['transcripts']["target"]
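The key-renaming loop above can also be written with `dict.pop`, which renames and deletes in one step; a toy sketch on a plain dict standing in for the SpatialData object:

```python
# Toy sketch of the renaming pattern above, using dict.pop on a plain dict
# (values are dummy placeholders; keys are a subset of the script's mapping).
elements = {"stitched_image": "img", "points": "pts", "table": "tbl"}
renaming = {"stitched_image": "morphology_mip", "points": "transcripts", "table": "metadata"}
for old_key, new_key in renaming.items():
    elements[new_key] = elements.pop(old_key)   # rename + delete old key
assert elements == {"morphology_mip": "img", "transcripts": "pts", "metadata": "tbl"}
```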
######################################
# Throw out all channels except DAPI #
######################################
# NOTE: We assume the "DNA" stain is comparable to DAPI.
print(datetime.now() - t0, "Throw out all channels except 'DNA' (DAPI?)", flush=True)

# TODO: in the future we want to keep the PolyT and Cellbound1/2/3 stains. Note however, that saving or plotting
# the sdata fails when these channels aren't excluded; not sure why...
sdata["morphology_mip"] = sdata["morphology_mip"].sel(c=["DNA"])


##############################
# Add info to metadata table #
##############################
print(datetime.now() - t0, "Add info to metadata table", flush=True)

for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism", "segmentation_id"]:
    sdata["metadata"].uns[key] = par[key]

#########
# Write #
#########
print(datetime.now() - t0, f"Writing to {par['output']}", flush=True)

sdata.write(par["output"])

print(datetime.now() - t0, "Done", flush=True)
