Skip to content

Commit 81bba31

Browse files
committed
WIP updating DLC-to-poseinterface example
1 parent b13e0da commit 81bba31

File tree

3 files changed

+213
-81
lines changed

3 files changed

+213
-81
lines changed
Lines changed: 113 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Convert DeepLabCut project to benchmark dataset
22
==================================================
33
Convert videos and labelled frames from a DeepLabCut (DLC) project to the
4-
``poseinterface`` benchmark dataset format.
4+
``poseinterface`` [benchmark dataset format](target-benchmark-dataset).
55
66
"""
77

@@ -11,39 +11,121 @@
1111
import shutil
1212
from pathlib import Path
1313

14-
from poseinterface.io import annotations_to_coco
14+
from poseinterface.utils import tree
1515

1616
# %%
17-
# Background
18-
# ----------
19-
# We've identified potential datasets from SWC that could be used for the pilot
20-
# version of the pose benchmark dataset.
21-
# Among these is the Elevated Plus Maze (EPM) dataset produced by
22-
# Loukia Katsouri, for John O'Keefe's lab.
17+
# Project overview
18+
# ----------------
19+
# Here we work with the "Elevated Plus Maze (EPM)" project from the
20+
# [Sainsbury Wellcome Centre (SWC)](https://www.sainsburywellcome.org/),
21+
# produced by Loukia Katsouri from John O'Keefe's lab.
22+
#
2323
# It contains single-animal top-down videos of mice exploring an elevated plus
24-
# maze, with keypoint annotations and predictions from DeepLabCut (DLC).
24+
# maze, with keypoint annotations and predictions from
25+
# [DeepLabCut (DLC)](https://www.mackenziemathislab.org/deeplabcut).
2526
#
26-
# In this example, we convert the DLC annotations to COCO .json format.
2727

2828
# %%
29-
# Define source and target directories
29+
# Prepare benchmark dataset directories
30+
# -------------------------------------
31+
# Remember that:
32+
# - A benchmark dataset is organised into a ``Train`` and a ``Test`` split.
33+
# - Each split contains one or more **projects**
34+
# (i.e. datasets contributed by different groups).
35+
#
36+
# Here we create a ``poseinterface_benchmarks`` directory to hold all of our
37+
# projects, with subfolders for ``Train`` and ``Test`` splits.
38+
# In each split, we create a folder named ``SWC-plusmaze`` to hold converted
39+
# files from the project described above. If any of these directories
40+
# already exist, they will be left unchanged.
41+
42+
benchmark_base_dir = Path("/mnt/Data/poseinterface_benchmarks")
43+
project_name = "SWC-plusmaze"
44+
45+
for split in ["Train", "Test"]:
46+
split_dir = benchmark_base_dir / split / project_name
47+
split_dir.mkdir(parents=True, exist_ok=True)
48+
49+
# print the directory structure as a tree
50+
print(tree(benchmark_base_dir, level=2))
51+
52+
# %%
53+
# Define source DLC project directory
3054
# ------------------------------------
31-
# We specify the paths to the source DLC project directory
32-
# as well as the target directory where converted files will be saved.
33-
# The target will be organised in the pose benchmarks dataset structure.
55+
# We specify the path to the source DLC project directory.
3456

3557
source_base_dir = Path(
36-
"/media/ceph-niu/neuroinformatics/sirmpilatzen/behav_data"
37-
"/Loukia/MASTER_DoNotModify"
58+
"/media/ceph-niu/neuroinformatics/sirmpilatzen/behav_data/Loukia/"
59+
"MASTER_DoNotModify"
3860
)
3961
source_project_dir = source_base_dir / "MouseTopDown-Loukia-2022-09-13"
4062
assert source_project_dir.exists(), (
4163
f"DLC project directory not found: {source_project_dir}"
4264
)
4365

44-
target_base_dir = Path("/mnt/Data/pose_benchmarks")
45-
target_dataset_dir = target_base_dir / "SWC-plusmaze"
46-
target_dataset_dir.mkdir(parents=True, exist_ok=True)
66+
# Print a tree representation of the DLC project directory structure
67+
print(tree(source_project_dir, level=1, exclude_hidden=True))
68+
69+
# %%
70+
# Of the various sub-directories, we are interested in the following:
71+
# - ``videos``: contains the video files as well as the corresponding
72+
# predictions from DLC.
73+
# - ``labeled-data``: contains the frames used for labeling and the
74+
# corresponding annotations in csv format.
75+
76+
print(
77+
tree(
78+
source_project_dir / "videos",
79+
level=1,
80+
length_limit=14,
81+
exclude_hidden=True,
82+
)
83+
)
84+
85+
# %%
86+
# We see that for each video file (ending in ``converted.mp4``), there are
87+
# corresponding DLC predictions in .h5 and .csv formats (both raw and
88+
# filtered versions). The files ending in ``.labeled.mp4`` are videos with the
89+
# DLC predictions overlaid, which we won't need.
90+
#
91+
# The video filenames in this project start with strings like
92+
# ``M708149_EPM_20200317_``. We'll use ``M708149`` as the subject identifier
93+
# and ``20200317`` (date in YYYYMMDD format) as the session identifier.
94+
# This DLC project contains videos acquired in multiple different experimental
95+
# setups, but we'll only focus on those containing the string ``EPM``
96+
# (elevated plus maze) in the filename, which are all top-down videos.
97+
98+
# %%
99+
# What about the ``labeled-data`` directory?
100+
101+
print(
102+
tree(
103+
source_project_dir / "labeled-data",
104+
level=2,
105+
length_limit=10,
106+
exclude_hidden=True,
107+
)
108+
)
109+
110+
# %%
111+
# We see that there are sub-directories named after the videos (without the
112+
# ``.mp4`` extension), which contain the frames used for labeling and the
113+
# corresponding annotations (for all frames sampled from this video)
114+
# in .csv and .h5 formats.
115+
116+
# %%
117+
# Convert a single video and its corresponding frame annotations
118+
# ---------------------------------------------------------------
119+
# For now, let's focus on converting a single EPM video.
120+
121+
source_video_name = "M708149_EPM_20200317_165049331-converted.mp4"
122+
123+
subject_id = source_video_name.split("_")[0] # "M708149"
124+
session_id = source_video_name.split("_")[2] # "20200317"
125+
camera_id = "topdown"
126+
127+
source_video_path = source_project_dir / "videos" / source_video_name
128+
assert source_video_path.exists(), f"Video file not found: {source_video_path}"
47129

48130
# %%
49131
# Copy video to target location
@@ -54,77 +136,27 @@
54136
source_video_name = "M708149_EPM_20200317_165049331-converted.mp4"
55137
source_video_path = source_project_dir / "videos" / source_video_name
56138

57-
# Define subject, session, and view identifiers
139+
# Define subject, session, and camera view identifiers
58140
subject_id = "M708149"
59141
session_id = "20200317"
60-
view_id = "topdown"
61-
video_id = f"sub-{subject_id}_ses-{session_id}_view-{view_id}"
142+
camera_id = "topdown"
62143

63-
# Create target session directory
64-
target_session_dir = target_dataset_dir / f"sub-{subject_id}_ses-{session_id}"
144+
session_prefix = f"sub-{subject_id}_ses-{session_id}"
145+
video_prefix = f"{session_prefix}_cam-{camera_id}"
146+
147+
# Create target sessions directory
148+
target_session_dir = (
149+
benchmark_base_dir / "Train" / project_name / session_prefix
150+
)
65151
target_session_dir.mkdir(parents=True, exist_ok=True)
66152

67153
# Copy video to target location
68-
target_video_path = target_session_dir / f"{video_id}.mp4"
154+
target_video_path = target_session_dir / f"{video_prefix}.mp4"
69155
if not target_video_path.exists():
70156
shutil.copy2(source_video_path, target_video_path)
71157
print(f"Copied video to: {target_video_path}")
72158
else:
73159
print(f"Video already exists at: {target_video_path}")
74160

75-
# %%
76-
# Define source annotations path
77-
# ------------------------------
78-
# The first attempt failed because the paths in the DLC annotations
79-
# csv file were given as
80-
# ``labeled-data,<video-name>,<filename-with-frame-number>.<extension>``
81-
# instead of the required
82-
# ``labeled-data/<video-name>/<filename-with-frame-number>.<extension>``.
83-
# We fixed this by replacing the commas with slashes in the csv file.
84-
85-
source_labels_dir = (
86-
source_project_dir / "labeled-data" / source_video_name.replace(".mp4", "")
87-
)
88-
source_annotations_path = source_labels_dir / "CollectedData_Loukia.csv"
89-
90-
# Create Frames directory inside the session directory
91-
target_frames_dir = target_session_dir / "Frames"
92-
target_frames_dir.mkdir(parents=True, exist_ok=True)
93-
94-
# Save COCO annotations inside the Frames directory
95-
target_annotations_path = target_frames_dir / f"{video_id}_framelabels.json"
96-
97-
# %%
98-
# Convert DLC annotations to COCO format
99-
# --------------------------------------
100-
# Here we use the :func:`annotations_to_coco` function from `poseinterface.io`
101-
# which wraps around `sleap_io` functionality to perform the conversion.
102-
103-
annotations_to_coco(
104-
input_path=source_annotations_path,
105-
output_json_path=target_annotations_path,
106-
sub_id=subject_id,
107-
ses_id=session_id,
108-
cam_id=view_id,
109-
)
110-
print(f"Saved COCO annotations to: {target_annotations_path}")
111-
112-
# %%
113-
# Copy labeled frames to target directory
114-
# ---------------------------------------
115-
# Copy the frames used for labeling and rename them to follow
116-
# the naming convention:
117-
# ``sub-{subjectID}_ses-{SessionID}_view-{ViewID}_frame-{FrameID}.png``
118-
119-
for source_frame_path in source_labels_dir.glob("*.png"):
120-
# Extract frame number from original filename, e.g. "img0042.png" -> "0042"
121-
frame_number = source_frame_path.stem.replace("img", "")
122-
target_frame_path = (
123-
target_frames_dir / f"{video_id}_frame-{frame_number}.png"
124-
)
125-
if not target_frame_path.exists():
126-
shutil.copy2(source_frame_path, target_frame_path)
127-
128-
print(f"Copied labeled frames to: {target_frames_dir}")
129-
130-
# %%
161+
# print the directory structure of the target session directory
162+
print(tree(target_session_dir, level=1))

poseinterface/utils.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""General utility functions for ``poseinterface``."""
2+
3+
from collections.abc import Iterator
4+
from itertools import islice
5+
from pathlib import Path
6+
7+
8+
def tree(
    dir_path: Path,
    *,
    level: int = -1,
    limit_to_directories: bool = False,
    exclude_hidden: bool = False,
    length_limit: int = 1000,
) -> str:
    """Return a visual tree structure of a directory as a string.

    Parameters
    ----------
    dir_path
        Path to the root directory.
    level
        Maximum depth to display. ``-1`` means no limit. Default is ``-1``.
    limit_to_directories
        If ``True``, only directories are shown. Default is ``False``.
    exclude_hidden
        If ``True``, files and directories starting with ``.`` are excluded.
        Default is ``False``.
    length_limit
        Maximum number of lines to include before truncating.
        Default is ``1000``.

    Returns
    -------
    str
        Tree representation of the directory structure, including a
        summary line with the count of directories and files.

    Notes
    -----
    Based on https://stackoverflow.com/a/59109706 by Aaron Hall, modified
    by community (see post 'Timeline' for change history).
    Retrieved 2026-03-27. License: CC BY-SA 4.0.

    Examples
    --------
    >>> from pathlib import Path
    >>> from poseinterface.utils import tree
    >>> print(tree(Path(".")))
    """
    # Box-drawing components used to render the branches.
    indent_blank = "    "
    indent_branch = "│   "
    connector_mid = "├── "
    connector_end = "└── "

    root = Path(dir_path)
    n_files = 0
    n_dirs = 0

    def _walk(current: Path, prefix: str = "", depth: int = -1) -> Iterator[str]:
        # Lazily yield one rendered line per visited entry. The counters are
        # bumped only when a yielded line is actually consumed, so entries
        # beyond ``length_limit`` are neither rendered nor counted.
        nonlocal n_files, n_dirs
        if not depth:
            # depth == 0 means we have exhausted the allowed display depth
            return
        entries = sorted(
            (
                entry
                for entry in current.iterdir()
                if not (exclude_hidden and entry.name.startswith("."))
            ),
            key=lambda entry: entry.name,
        )
        if limit_to_directories:
            entries = [entry for entry in entries if entry.is_dir()]
        final_index = len(entries) - 1
        for index, entry in enumerate(entries):
            # Last sibling gets the closing elbow; all others get a tee.
            connector = connector_end if index == final_index else connector_mid
            if entry.is_dir():
                yield prefix + connector + entry.name
                n_dirs += 1
                deeper_prefix = prefix + (
                    indent_blank if index == final_index else indent_branch
                )
                yield from _walk(entry, prefix=deeper_prefix, depth=depth - 1)
            elif not limit_to_directories:
                yield prefix + connector + entry.name
                n_files += 1

    rendered: list[str] = [root.name]
    line_iter = _walk(root, depth=level)
    rendered.extend(islice(line_iter, length_limit))
    # Pulling one more item tells us whether the tree was truncated.
    if next(line_iter, None):
        rendered.append(f"... length_limit, {length_limit}, reached, counted:")
    summary = f"\n{n_dirs} directories"
    if n_files:
        summary += f", {n_files} files"
    rendered.append(summary)
    return "\n".join(rendered)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ requires-python = ">=3.11.0"
1111
dynamic = ["version"]
1212

1313
dependencies = [
14+
"jupyter>=1.1.1",
1415
"sleap-io>=0.6.4",
1516
]
1617

0 commit comments

Comments
 (0)