Merge pull request #150 from computational-cell-analytics/finetune-example

constantinpape · web-flow · commit b5face55fe67 · 2023-08-17T16:43:38.000+02:00
Finetune example
diff --git a/README.md b/README.md
@@ -16,13 +16,7 @@ We implement napari applications for:
 <img src="https://github.com/computational-cell-analytics/micro-sam/assets/4263537/dfca3d9b-dba5-440b-b0f9-72a0683ac410" width="256">
 <img src="https://github.com/computational-cell-analytics/micro-sam/assets/4263537/aefbf99f-e73a-4125-bb49-2e6592367a64" width="256">
 
-**Beta version**
-
-This is an advanced beta version. While many features are still under development, we aim to keep the user interface and python library stable.
-Any feedback is welcome, but please be aware that the functionality is under active development and that some features may not be thoroughly tested yet.
-We will soon provide a stand-alone application for running the `micro_sam` annotation tools, and plan to also release it as [napari plugin](https://napari.org/stable/plugins/index.html) in the future.
-
-If you run into any problems or have questions please open an issue on Github or reach out via [image.sc](https://forum.image.sc/) using the tag `micro-sam` and tagging @constantinpape.
+If you run into any problems or have questions regarding our tool, please open an issue on GitHub or reach out via [image.sc](https://forum.image.sc/) using the tag `micro-sam` and tagging @constantinpape.
 
 
 ## Installation and Usage
diff --git a/examples/README.md b/examples/README.md
@@ -6,5 +6,8 @@ Examples for using the micro_sam annotation tools:
 - `annotator_tracking.py`: run the interactive tracking annotation tool
 - `image_series_annotator.py`: run the annotation tool for a series of images
 
+The folder `finetuning` contains example scripts that show how a Segment Anything model can be fine-tuned
+on custom data with the `micro_sam.training` library, and how the finetuned models can then be used within the annotation tools.
+
 The folder `use_as_library` contains example scripts that show how `micro_sam` can be used as a python
 library to apply Segment Anything to mult-dimensional data.
diff --git a/examples/finetuning/.gitignore b/examples/finetuning/.gitignore
@@ -0,0 +1,2 @@
+checkpoints/
+logs/
diff --git a/examples/finetuning/finetune_hela.py b/examples/finetuning/finetune_hela.py
@@ -0,0 +1,140 @@
+import os
+
+import numpy as np
+import torch
+import torch_em
+
+import micro_sam.training as sam_training
+from micro_sam.sample_data import fetch_tracking_example_data, fetch_tracking_segmentation_data
+from micro_sam.util import export_custom_sam_model
+
+DATA_FOLDER = "data"
+
+
+def get_dataloader(split, patch_shape, batch_size):
+    """Return train or val data loader for finetuning SAM.
+
+    The data loader must be a torch data loader that returns `x, y` tensors,
+    where `x` is the image data and `y` are the labels.
+    The labels have to be in a label mask instance segmentation format.
+    I.e. a tensor of the same spatial shape as `x`, with each object mask having its own ID.
+    Important: the ID 0 is reserved for background, and the IDs must be consecutive.
+
+    Here, we use `torch_em.default_segmentation_loader` for creating a suitable data loader from
+    the example hela data. You can either adapt this for your own data (see comments below)
+    or write a suitable torch dataloader yourself.
+    """
+    assert split in ("train", "val")
+    os.makedirs(DATA_FOLDER, exist_ok=True)
+
+    # This will download the image and segmentation data for training.
+    image_dir = fetch_tracking_example_data(DATA_FOLDER)
+    segmentation_dir = fetch_tracking_segmentation_data(DATA_FOLDER)
+
+    # torch_em.default_segmentation_loader is a convenience function to build a torch dataloader
+    # from image data and labels for training segmentation models.
+    # It supports image data in various formats. Here, we load image data and labels from the two
+    # folders with tif images that were downloaded by the example data functionality, by specifying
+    # `raw_key` and `label_key` as `*.tif`. This means all images in the respective folders that end with
+    # .tif will be loaded.
+    # The function supports many other file formats. For example, if you have tif stacks with multiple slices
+    # instead of multiple tif images in a folder, then you can pass raw_key=label_key=None.
+
+    # Load images from multiple files in folder via pattern (here: all tif files)
+    raw_key, label_key = "*.tif", "*.tif"
+    # Alternative: if you have tif stacks you can just set raw_key and label_key to None
+    # raw_key, label_key= None, None
+
+    # The 'roi' argument can be used to subselect parts of the data.
+    # Here, we use it to select the first 70 frames for the train split and the other frames for the val split.
+    if split == "train":
+        roi = np.s_[:70, :, :]
+    else:
+        roi = np.s_[70:, :, :]
+
+    loader = torch_em.default_segmentation_loader(
+        raw_paths=image_dir, raw_key=raw_key,
+        label_paths=segmentation_dir, label_key=label_key,
+        patch_shape=patch_shape, batch_size=batch_size,
+        ndim=2, is_seg_dataset=True, rois=roi,
+        label_transform=torch_em.transform.label.connected_components,
+    )
+    return loader
+
+
+def run_training(checkpoint_name, model_type):
+    """Run the actual model training."""
+
+    # All hyperparameters for training.
+    batch_size = 1  # the training batch size
+    patch_shape = (1, 512, 512)  # the size of patches for training
+    n_objects_per_batch = 25  # the number of objects per batch that will be sampled
+    device = torch.device("cuda")  # the device/GPU used for training
+    n_iterations = 10000  # how long we train (in iterations)
+
+    # Get the dataloaders.
+    train_loader = get_dataloader("train", patch_shape, batch_size)
+    val_loader = get_dataloader("val", patch_shape, batch_size)
+
+    # Get the segment anything model, the optimizer and the LR scheduler
+    model = sam_training.get_trainable_sam_model(model_type=model_type, device=device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.9, patience=10, verbose=True)
+
+    # This class creates all the training data for a batch (inputs, prompts and labels).
+    convert_inputs = sam_training.ConvertToSamInputs()
+
+    # the trainer which performs training and validation (implemented using "torch_em")
+    trainer = sam_training.SamTrainer(
+        name=checkpoint_name,
+        train_loader=train_loader,
+        val_loader=val_loader,
+        model=model,
+        optimizer=optimizer,
+        # currently we compute the loss batch-wise; pass channelwise=True to compute it channel-wise instead
+        loss=torch_em.loss.DiceLoss(channelwise=False),
+        metric=torch_em.loss.DiceLoss(),
+        device=device,
+        lr_scheduler=scheduler,
+        logger=sam_training.SamLogger,
+        log_image_interval=10,
+        mixed_precision=True,
+        convert_inputs=convert_inputs,
+        n_objects_per_batch=n_objects_per_batch,
+        n_sub_iteration=8,
+        compile_model=False
+    )
+    trainer.fit(n_iterations)
+
+
+def export_model(checkpoint_name, model_type):
+    """Export the trained model."""
+    # export the model after training so that it can be used by the rest of the micro_sam library
+    export_path = "./finetuned_hela_model.pth"
+    checkpoint_path = os.path.join("checkpoints", checkpoint_name, "best.pt")
+    export_custom_sam_model(
+        checkpoint_path=checkpoint_path,
+        model_type=model_type,
+        save_path=export_path,
+    )
+
+
+def main():
+    """Finetune a Segment Anything model.
+
+    This example uses image data and segmentations from the cell tracking challenge,
+    but can easily be adapted for other data (including data you have annotated with micro_sam beforehand).
+    """
+    # The model_type determines which base model is used to initialize the weights that are finetuned.
+    # We use vit_b here because it can be trained faster. Note that vit_h usually yields higher quality results.
+    model_type = "vit_b"
+
+    # The name of the checkpoint. The checkpoints will be stored in './checkpoints/<checkpoint_name>'
+    checkpoint_name = "sam_hela"
+
+    run_training(checkpoint_name, model_type)
+    export_model(checkpoint_name, model_type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/finetuning/use_finetuned_model.py b/examples/finetuning/use_finetuned_model.py
@@ -0,0 +1,33 @@
+import imageio.v3 as imageio
+
+import micro_sam.util as util
+from micro_sam.sam_annotator import annotator_2d
+
+
+def run_annotator_with_custom_model():
+    """Run the 2d annotator with a custom (finetuned) model.
+
+    Here, we use the model that is produced by `finetune_hela.py` and apply it
+    to an image from the validation set.
+    """
+    # take the last frame, which is part of the val set, so the model was not directly trained on it
+    im = imageio.imread("./data/DIC-C2DH-HeLa.zip.unzip/DIC-C2DH-HeLa/01/t083.tif")
+
+    # set the checkpoint and the path for caching the embeddings
+    checkpoint = "./finetuned_hela_model.pth"
+    embedding_path = "./embeddings/embeddings-finetuned.zarr"
+
+    model_type = "vit_b"  # We finetune a vit_b in the example script.
+    # Adapt this if you finetune a different model type, e.g. vit_h.
+
+    # Load the custom model.
+    predictor = util.get_sam_model(model_type=model_type, checkpoint_path=checkpoint)
+
+    # Run the 2d annotator with the custom model.
+    annotator_2d(
+        im, embedding_path=embedding_path, predictor=predictor, precompute_amg_state=True,
+    )
+
+
+if __name__ == "__main__":
+    run_annotator_with_custom_model()
diff --git a/finetuning/README.md b/finetuning/README.md
@@ -1,8 +1,8 @@
 # Segment Anything Finetuning
 
-Preliminary examples for fine-tuning segment anything on custom datasets.
+Code for finetuning Segment Anything models on microscopy data and evaluating the finetuned models.
 
-## LiveCELL
+## Example: LiveCELL
 
 **Finetuning**
 
@@ -47,3 +47,12 @@ E.g. run the script like below to evaluate the previous predictions.
 python livecell_evaluation.py -i /scratch/projects/nim00007/data/LiveCELL -e experiment
 ```
 This will create a folder `experiment/results` with csv tables with the results per cell type and averaged over all images.
+
+
+## Finetuning and evaluation code
+
+The subfolders contain the code for different finetuning and evaluation experiments for microscopy data:
+- `livecell`: TODO
+- `generalist`: TODO
+
+Note: we still need to clean up most of this code and will add it later.
diff --git a/micro_sam/sample_data.py b/micro_sam/sample_data.py
@@ -9,7 +9,7 @@
 import pooch
 
 
-def fetch_image_series_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_image_series_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample images for the image series annotator.
 
     Args:
@@ -36,7 +36,7 @@ def fetch_image_series_example_data(save_directory: Union[str, os.PathLike]) ->
     return data_folder
 
 
-def fetch_wholeslide_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_wholeslide_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample data for the 2d annotator.
 
     This downloads part of a whole-slide image from the NeurIPS Cell Segmentation Challenge.
@@ -61,7 +61,7 @@ def fetch_wholeslide_example_data(save_directory: Union[str, os.PathLike]) -> Un
     return os.path.join(save_directory, fname)
 
 
-def fetch_livecell_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_livecell_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample data for the 2d annotator.
 
     This downloads a single image from the LiveCELL dataset.
@@ -86,7 +86,7 @@ def fetch_livecell_example_data(save_directory: Union[str, os.PathLike]) -> Unio
     return os.path.join(save_directory, fname)
 
 
-def fetch_hela_2d_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_hela_2d_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample data for the 2d annotator.
 
     This downloads a single image from the HeLa CTC dataset.
@@ -110,7 +110,7 @@ def fetch_hela_2d_example_data(save_directory: Union[str, os.PathLike]) -> Union
     return os.path.join(save_directory, fname)
 
 
-def fetch_3d_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_3d_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample data for the 3d annotator.
 
     This downloads the Lucchi++ datasets from https://casser.io/connectomics/.
@@ -139,7 +139,7 @@ def fetch_3d_example_data(save_directory: Union[str, os.PathLike]) -> Union[str,
     return str(lucchi_dir)
 
 
-def fetch_tracking_example_data(save_directory: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+def fetch_tracking_example_data(save_directory: Union[str, os.PathLike]) -> str:
     """Download the sample data for the tracking annotator.
 
     This data is the cell tracking challenge dataset DIC-C2DH-HeLa.
@@ -171,3 +171,32 @@ def fetch_tracking_example_data(save_directory: Union[str, os.PathLike]) -> Unio
     cell_tracking_dir = save_directory.joinpath(f"{fname}.unzip", "DIC-C2DH-HeLa", "01")
     assert os.path.exists(cell_tracking_dir)
     return str(cell_tracking_dir)
+
+
+def fetch_tracking_segmentation_data(save_directory: Union[str, os.PathLike]) -> str:
+    """Download groundtruth segmentation for the tracking example data.
+
+    This downloads the groundtruth segmentation for the image data from `fetch_tracking_example_data`.
+
+    Args:
+        save_directory: Root folder to save the downloaded data.
+    Returns:
+        The folder that contains the downloaded data.
+    """
+    save_directory = Path(save_directory)
+    os.makedirs(save_directory, exist_ok=True)
+    print("Example data directory is:", save_directory.resolve())
+    unpack_filenames = [os.path.join("masks", f"mask_{str(i).zfill(4)}.tif") for i in range(84)]
+    unpack = pooch.Unzip(members=unpack_filenames)
+    fname = "hela-ctc-01-gt.zip"
+    pooch.retrieve(
+        url="https://owncloud.gwdg.de/index.php/s/AWxQMblxwR99OjC/download",
+        known_hash="c0644d8ebe1390fb60125560ba15aa2342caf44f50ff0667a0318ea0ac6c958b",
+        fname=fname,
+        path=save_directory,
+        progressbar=True,
+        processor=unpack,
+    )
+    cell_tracking_dir = save_directory.joinpath(f"{fname}.unzip", "masks")
+    assert os.path.exists(cell_tracking_dir)
+    return str(cell_tracking_dir)
diff --git a/micro_sam/util.py b/micro_sam/util.py
@@ -191,7 +191,8 @@ def get_custom_sam_model(
     custom_pickle = pickle
     custom_pickle.Unpickler = _CustomUnpickler
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
     sam = sam_model_registry[model_type]()
 
     # load the model state, ignoring any attributes that can't be found by pickle
@@ -230,7 +231,9 @@ def export_custom_sam_model(
         model_type: The SegmentAnything model type to use (vit_h, vit_b or vit_l).
         save_path: Where to save the exported model.
     """
-    _, state = get_custom_sam_model(checkpoint_path, model_type=model_type, return_state=True)
+    _, state = get_custom_sam_model(
+        checkpoint_path, model_type=model_type, return_state=True, device=torch.device("cpu"),
+    )
     model_state = state["model_state"]
     prefix = "sam."
     model_state = OrderedDict(