lanl
diff --git a/‎.gitlab-ci.yml‎
Lines changed: 0 additions & 41 deletions b/‎.gitlab-ci.yml‎
Lines changed: 0 additions & 41 deletions
diff --git a/‎README.md‎
Lines changed: 61 additions & 3 deletions b/‎README.md‎
Lines changed: 61 additions & 3 deletions
diff --git a/‎neat_ml/data/opencv_detection_test.yaml‎
Lines changed: 12 additions & 0 deletions b/‎neat_ml/data/opencv_detection_test.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎neat_ml/opencv/__init__.py‎ b/‎neat_ml/opencv/__init__.py‎
diff --git a/‎neat_ml/opencv/detection.py‎
Lines changed: 194 additions & 0 deletions b/‎neat_ml/opencv/detection.py‎
Lines changed: 194 additions & 0 deletions
@@ -2,14 +2,72 @@
 
 ## Installing the project
 
-To install the project, clone the repository and install
-the package, core dependencies, and optional dependencies
-by calling:
+Download the package from the following GitHub repository:  
+
+```bash
+git clone git@github.com:lanl/ldrd_neat_ml.git
+```
+
+Install the project, core dependencies,
+and optional dependencies by calling:
 
 ```
 python -m pip install -v ".[dev]" 
 ```
 
+## Writing a `.yaml` input file for OpenCV detection
+
+The workflow takes as input a `.yaml` configuration file with information
+on where to find the input image data for blob detection; save the
+output images.
+
+The `.yaml` file should follow the format below (an example
+can be found at `neat_ml/data/opencv_detection_test.yaml`)
+Input paths can be provided as either absolute or relative
+file paths.
+
+```yaml
+roots:
+  work: path/to/save/output
+
+datasets:
+  - id: name_of_save_folder
+    method: Currently only supports ``OpenCV`` (or ``opencv``) as input
+    class: subfolder_for_image_class
+    time_label: subfolder_for_timestamp
+
+    detection:
+      img_dir: path/to/image/data (Can be a directory of ``.tiff`` images or a path to a single ``.tiff`` image.)
+      debug: True/False for debug (`True` will save side-by-side figure
+             of raw image next to bounding box overlay.)
+```
+
+## Running OpenCV detection
+
+To run the workflow with a given `.yaml` file: 
+
+`python run_workflow.py --config <YAML file> --steps detect`
+
+To run the workflow using ``opencv_detection_test.yaml``:
+
+1. download and install the project
+2. run the test-suite to download the test images from `pooch`
+3. run the following command to get the path where the images are stored
+
+```
+python -c "import pooch; print(pooch.os_cache('test_images'))"
+```
+
+4. replace ``datatsets:detection:img_dir`` `path/to/pooch/images` in the `.yaml` with the local filepath
+5. call the `run_workflow` command with `--config neat_ml/data/opencv_detection_test.yaml`
+
+This should process and detect bubbles from the image file `images_raw.tiff` and 
+place the outputs under ``roots:work`` filepath from the `.yaml` file
+
+For information relevant to running the workflow:  
+
+`python run_workflow.py --help`
+
 ## Running the Main ML workflow
 
 Note that the first incantation of the main ML
 
@@ -0,0 +1,12 @@
+roots:
+  work: neat_ml/tests/data/
+
+datasets:
+  - id: Test
+    method: OpenCV
+    class: ''
+    time_label: ''
+    
+    detection:
+      img_dir: 'path/to/pooch/images/images_raw.tiff'
+      debug: True
@@ -0,0 +1,194 @@
+from pathlib import Path
+from typing import Sequence
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import skimage.color
+from joblib import Memory
+from tqdm.auto import tqdm
+
+
+__all__: Sequence[str] = [
+    "run_opencv"
+] 
+
+
+def _detect_single_image(
+    img_path: Path
+) -> tuple[int, float, pd.DataFrame]:
+    """
+    Detect bubbles in a single image using OpenCV's SimpleBlobDetector.
+
+    Parameters
+    ----------
+    img_path : Path
+        Absolute file path to the image to process.
+
+    Returns
+    -------
+    tuple[int, float, pd.DataFrame]
+        num_blobs : int
+            Number of blobs detected in the image.
+        median_radius : float
+            Median radius of detected blobs (NaN if none).
+        bubble_data : pd.DataFrame
+            DataFrame with one row per detected blob, each containing:
+            - 'bubble_number' (int)
+            - 'center' (tuple[float, float])
+            - 'radius' (float)
+            - 'area' (float)
+            - 'bbox' (tuple[int, int, int, int])
+    """
+    image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # type: ignore[call-overload]
+    if image is None:
+        raise FileNotFoundError(f"Unable to read image file: {img_path}")
+
+    # the parameters for ``SimpleBlobDetector`` were determined manually
+    # by hand-tuning via visual inspection NOT by using a standardized
+    # hyperparameter optimization method (see: issue #13)
+    params = cv2.SimpleBlobDetector_Params() # type: ignore[attr-defined]
+    params.minThreshold = 10
+    params.maxThreshold = 200
+    params.thresholdStep = 10
+    params.filterByColor = False
+    params.filterByArea = True
+    params.minArea = 20
+    params.maxArea = 50000
+    params.filterByCircularity = True
+    params.minCircularity = 0.75
+    params.filterByConvexity = True
+    params.minConvexity = 0.80
+    params.filterByInertia = True
+    params.minInertiaRatio = 0.75
+
+    detector = cv2.SimpleBlobDetector_create(params) # type: ignore[attr-defined]
+    keypoints = detector.detect(image)
+
+    bubble_data = pd.DataFrame(index=range(len(keypoints)),
+        columns=["bubble_number", "center", "radius", "area", "bbox"]).fillna(np.nan)
+    bubble_data[['center', 'bbox']] = bubble_data[['center', 'bbox']].astype('object')
+    for idx, kp in enumerate(keypoints):
+        cx, cy = kp.pt
+        r = kp.size / 2.0
+        bbox = (cx - r, cy - r, cx + r, cy + r)
+        bubble_data_row = {
+            "bubble_number": idx + 1,
+            "center": (cx, cy),
+            "radius": r,
+            "area": np.pi * r**2,
+            "bbox": bbox,
+        }
+        bubble_data.loc[idx] = pd.Series(bubble_data_row)
+
+    num_blobs = len(keypoints)
+    median_radius = np.nanmedian(bubble_data["radius"])
+    return num_blobs, median_radius, bubble_data
+
+def _save_debug_overlay(
+    img_path: Path,
+    bubble_data: pd.DataFrame,
+    out_dir: Path,
+) -> None:
+    """
+    Create and save a side-by-side figure containing the original
+    image next to the image overlaid with opencv segmentation mask
+
+    Parameters
+    ----------
+    img_path : Path
+        File path to the original image.
+    bubble_data : pd.DataFrame
+        DataFrame of bubble metadata as returned by _detect_single_image.
+    out_dir : Path
+        Directory where the debug PNG will be saved.
+    """
+    image_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # type: ignore[call-overload]
+
+    if image_gray is None:
+        raise FileNotFoundError(f"Could not read image for debug overlay: {img_path}")
+    
+    image_rgb = skimage.color.gray2rgb(image_gray)
+
+    overlay = image_rgb.copy()
+
+    for index, bubble in bubble_data.iterrows():
+        bbox = bubble["bbox"]
+        x_min, y_min, x_max, y_max = map(int, bbox)
+        cv2.rectangle(overlay, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+
+    fig, ax = plt.subplots(1, 2, figsize=(12, 8))
+    ax[0].imshow(image_rgb)
+    ax[0].set_title("Original")
+    ax[0].axis("off")
+    ax[1].imshow(overlay)
+    ax[1].set_title("Detected blobs")
+    ax[1].axis("off")
+
+    png_name = f"{img_path.stem}_debug.png"
+    fig.savefig(out_dir / png_name, dpi=300, bbox_inches="tight")
+    plt.close(fig)
+
+def run_opencv(
+    df: pd.DataFrame,
+    out_dir: Path,
+    debug: bool = False,
+) -> pd.DataFrame:
+    """
+    Detect bubbles in every image referenced by df using the
+    OpenCV ``SimpleBlobDetector``.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        dataframe containing absolute image filepaths.
+    out_dir : Path
+        Path to save the outputs.
+    debug : bool
+        If True save side by side diagnostic images.
+
+    Returns
+    -------
+    df_out : pandas.DataFrame
+        Copy of df enriched with: 
+            num_blobs_opencv number of blobs detected
+            median_radii_opencv median droplet radius
+    """
+    if 'image_filepath' not in df.columns:
+        raise ValueError("DataFrame must contain 'image_filepath' column.")
+    
+    if not out_dir.is_absolute():
+        raise ValueError(
+            f"Absolute file path required, got {out_dir}"
+        )
+    
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    memory = Memory(location=out_dir / ".joblib_cache", verbose=0)
+    cached_detect = memory.cache(_detect_single_image)
+
+    df_out = df.copy()
+    df_out[["num_blobs_opencv", "median_radii_opencv"]] = np.nan
+   
+    for idx, row in tqdm(
+        df_out.iterrows(),
+        total=df_out.shape[0],
+        desc="OpenCV SimpleBlobDetector",
+    ):
+        img_path = row.image_filepath
+
+        num_blobs, median_r, bubble_data = cached_detect(img_path)
+
+        df_bubbles = pd.DataFrame(bubble_data)
+        df_bubbles.to_parquet(
+            out_dir / f"{img_path.stem}_bubble_data.parquet.gzip",
+            compression="gzip")
+
+        if debug:
+            _save_debug_overlay(img_path, bubble_data, out_dir)
+
+        df_out.loc[idx, "num_blobs_opencv"] = num_blobs  # type: ignore[index]
+        df_out.loc[idx, "median_radii_opencv"] = median_r  # type:ignore[index]
+
+    return df_out