Merge pull request #1714 from dandi/add-session-duration

yarikoptic · web-flow · commit 471e3ada087c · 2025-10-13T13:14:46.000-04:00
feat: add session end time extraction from NWB files
diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py
@@ -588,6 +588,7 @@ def extract_session(metadata: dict) -> list[models.Session] | None:
             name=session_id or "Acquisition session",
             description=metadata.get("session_description"),
             startDate=metadata.get("session_start_time"),
+            endDate=metadata.get("session_end_time"),
             used=probes,
         )
     ]
diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py
@@ -2,6 +2,7 @@
 
 from collections import Counter
 from collections.abc import Callable
+from datetime import timedelta
 import inspect
 import os
 import os.path as op
@@ -14,6 +15,7 @@
 from fscacher import PersistentCache
 import h5py
 import hdmf
+import numpy as np
 from packaging.version import Version
 import pynwb
 from pynwb import NWBHDF5IO
@@ -262,9 +264,115 @@ def _get_pynwb_metadata(path: str | Path | Readable) -> dict[str, Any]:
         # get external_file data:
         out["external_file_objects"] = _get_image_series(nwb)
 
+        # Calculate session duration for metadata
+        session_duration = _get_session_duration(nwb)
+        if session_duration is not None and out.get("session_start_time") is not None:
+            # Convert to absolute datetime by adding duration to session_start_time
+            start_time = out["session_start_time"]
+            out["session_end_time"] = start_time + timedelta(seconds=session_duration)
+
     return out
 
 
+def _get_session_duration(nwb: pynwb.NWBFile) -> float | None:
+    """Calculate the duration of a recording session from NWB file contents.
+
+    This function reads the first and last time values of each TimeSeries and
+    DynamicTable with time information (entries are assumed to be time-ordered),
+    then returns the duration as max(end times) - min(start times).
+
+    Parameters
+    ----------
+    nwb: pynwb.NWBFile
+        An open NWB file object
+
+    Returns
+    -------
+    float or None
+        The session duration in seconds (max_time - min_time), or None if no
+        time information could be extracted or the duration is 5 years or more
+    """
+    start_times: list[float] = []
+    end_times: list[float] = []
+
+    # Iterate through all objects in the NWB file
+    for obj in nwb.objects.values():
+        # Handle TimeSeries objects
+        if isinstance(obj, pynwb.base.TimeSeries):
+            if obj.timestamps is not None and len(obj.timestamps) > 0:
+                # Use first and last timestamps
+                start_times.append(float(obj.timestamps[0]))
+                end_times.append(float(obj.timestamps[-1]))
+            elif (
+                obj.starting_time is not None
+                and obj.rate is not None
+                and obj.data is not None
+            ):
+                # Calculate start and end time
+                start_times.append(float(obj.starting_time))
+                num_samples = len(obj.data)
+                if obj.rate == 0:
+                    continue
+                end_times.append(float(obj.starting_time + (num_samples / obj.rate)))
+
+        # Handle DynamicTable objects with time columns
+        elif isinstance(obj, hdmf.common.DynamicTable):
+            # Handle start_time and stop_time columns (e.g., trials)
+            if "start_time" in obj.colnames and len(obj["start_time"]):
+                start_times.append(float(obj["start_time"][0]))
+            if "stop_time" in obj.colnames and len(obj["stop_time"]):
+                end_times.append(float(obj["stop_time"][-1]))
+
+            # Handle spike_times column (e.g., Units table)
+            # Assume spike times are ordered within each unit
+            # Read only the first and last spike time from each unit
+            if "spike_times" in obj.colnames and len(obj["spike_times"]):
+                idxs = obj["spike_times"].data[:]
+
+                # drop a leading 0 index (the first unit has no spikes)
+                if idxs[0] == 0:
+                    idxs = idxs[1:]
+
+                st_data = obj["spike_times"].target
+
+                if len(idxs) > 1:
+                    start = float(np.min(np.r_[st_data[0], st_data[idxs[:-1]]]))
+                else:
+                    start = float(st_data[0])
+
+                end = float(np.max(st_data[idxs - 1]))
+                start_times.append(float(start))
+                end_times.append(float(end))
+
+            # Handle timestamp column (e.g., EventsTable)
+            if "timestamp" in obj.colnames and len(obj["timestamp"]):
+                timestamp_data = obj["timestamp"]
+                start_times.append(float(timestamp_data[0]))
+                # Check if duration column exists to calculate end times
+                if "duration" in obj.colnames:
+                    duration_data = obj["duration"]
+                    end_times.append(float(timestamp_data[-1] + duration_data[-1]))
+                else:
+                    # No duration, use max timestamp as end
+                    end_times.append(float(timestamp_data[-1]))
+
+    # Return duration as max - min
+    if start_times and end_times:
+        duration = max(end_times) - min(start_times)
+        if (
+            duration < 3600 * 24 * 365 * 5
+        ):  # if duration is over 5 years, something went wrong
+            return duration
+        else:
+            lgr.warning(
+                "Session duration of %.2f seconds (%.2f years) exceeds 5-year limit; "
+                "returning None as this likely indicates an error in timestamps",
+                duration,
+                duration / (3600 * 24 * 365),
+            )
+    return None
+
+
 def _get_image_series(nwb: pynwb.NWBFile) -> list[dict]:
     """Retrieves all ImageSeries related metadata from an open nwb file.
 
diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py
@@ -28,7 +28,10 @@
 )
 from dandischema.models import Dandiset as DandisetMeta
 from dateutil.tz import tzutc
+from hdmf.common import DynamicTable
+import numpy as np
 from pydantic import ByteSize
+from pynwb import NWBHDF5IO, NWBFile, TimeSeries
 import pytest
 import requests
 from semantic_version import Version
@@ -471,6 +474,239 @@ def test_time_extract_gest() -> None:
     )
 
 
+@pytest.mark.ai_generated
+def test_session_duration_extraction(tmp_path: Path) -> None:
+    """Test that session duration is extracted and included in Session activity"""
+    # Create a test NWB file with TimeSeries data
+    nwb_path = tmp_path / "test_duration.nwb"
+    session_start = datetime(2020, 1, 1, 12, 0, 0, tzinfo=tzutc())
+
+    nwbfile = NWBFile(
+        session_description="test session for duration",
+        identifier="test_duration_123",
+        session_start_time=session_start,
+    )
+
+    # Add a TimeSeries that spans 100 seconds (timestamps from 0 to 100)
+    data = np.random.rand(1000)
+    timestamps = np.linspace(0, 100, 1000)
+    ts1 = TimeSeries(name="timeseries1", data=data, unit="volts", timestamps=timestamps)
+    nwbfile.add_acquisition(ts1)
+
+    # Add another TimeSeries using starting_time and rate
+    # This one goes from 50s to 150s (100 samples at 1 Hz)
+    data2 = np.random.rand(100)
+    ts2 = TimeSeries(
+        name="timeseries2", data=data2, unit="volts", starting_time=50.0, rate=1.0
+    )
+    nwbfile.add_acquisition(ts2)
+
+    # Write the file
+    with NWBHDF5IO(str(nwb_path), "w") as io:
+        io.write(nwbfile)
+
+    # Extract metadata
+    from ..metadata.nwb import get_metadata, nwb2asset
+
+    metadata = get_metadata(nwb_path)
+
+    # Check that session_end_time was calculated
+    assert "session_start_time" in metadata
+    assert "session_end_time" in metadata
+
+    # Calculate duration - should be 150 seconds (max) - 0 seconds (min)
+    duration = (
+        metadata["session_end_time"] - metadata["session_start_time"]
+    ).total_seconds()
+    assert abs(duration - 150.0) < 1.0  # Allow small floating point errors
+
+    # Check that Session activity includes endDate
+    asset = nwb2asset(nwb_path, digest=DUMMY_DANDI_ETAG)
+    assert asset.wasGeneratedBy is not None
+
+    # Find Session activities
+    sessions = [act for act in asset.wasGeneratedBy if act.schemaKey == "Session"]
+    assert len(sessions) > 0
+
+    session = sessions[0]
+    assert session.startDate is not None
+    assert session.endDate is not None
+    assert session.startDate == metadata["session_start_time"]
+    assert session.endDate == metadata["session_end_time"]
+
+
+@pytest.mark.ai_generated
+def test_session_duration_with_trials(tmp_path: Path) -> None:
+    """Test that session duration includes trials table timestamps"""
+    # Create a test NWB file with trials
+    nwb_path = tmp_path / "test_duration_trials.nwb"
+    session_start = datetime(2020, 1, 1, 12, 0, 0, tzinfo=tzutc())
+
+    nwbfile = NWBFile(
+        session_description="test session with trials",
+        identifier="test_trials_123",
+        session_start_time=session_start,
+    )
+
+    # Add a TimeSeries that spans from 10 to 50 seconds
+    data = np.random.rand(400)
+    timestamps = np.linspace(10, 50, 400)
+    ts = TimeSeries(name="timeseries1", data=data, unit="volts", timestamps=timestamps)
+    nwbfile.add_acquisition(ts)
+
+    # Add trials that extend the session to 200 seconds
+    nwbfile.add_trial_column(
+        name="correct", description="whether the trial was correct"
+    )
+    nwbfile.add_trial(start_time=5.0, stop_time=15.0, correct=True)
+    nwbfile.add_trial(start_time=20.0, stop_time=30.0, correct=False)
+    nwbfile.add_trial(start_time=100.0, stop_time=200.0, correct=True)
+
+    # Write the file
+    with NWBHDF5IO(str(nwb_path), "w") as io:
+        io.write(nwbfile)
+
+    # Extract metadata
+    from ..metadata.nwb import get_metadata, nwb2asset
+
+    metadata = get_metadata(nwb_path)
+
+    # Check that session_end_time was calculated
+    assert "session_start_time" in metadata
+    assert "session_end_time" in metadata
+
+    # Calculate duration - should be 200 (max from trials) - 5 (min from trials) = 195 seconds
+    duration = (
+        metadata["session_end_time"] - metadata["session_start_time"]
+    ).total_seconds()
+    assert abs(duration - 195.0) < 1.0  # Allow small floating point errors
+
+    # Check that Session activity includes endDate
+    asset = nwb2asset(nwb_path, digest=DUMMY_DANDI_ETAG)
+    assert asset.wasGeneratedBy is not None
+
+    # Find Session activities
+    sessions = [act for act in asset.wasGeneratedBy if act.schemaKey == "Session"]
+    assert len(sessions) > 0
+
+    session = sessions[0]
+    assert session.startDate is not None
+    assert session.endDate is not None
+    assert session.startDate == metadata["session_start_time"]
+    assert session.endDate == metadata["session_end_time"]
+
+
+@pytest.mark.ai_generated
+def test_session_duration_with_units(tmp_path: Path) -> None:
+    """Test that session duration includes spike_times from Units table"""
+    # Create a test NWB file with Units table
+    nwb_path = tmp_path / "test_duration_units.nwb"
+    session_start = datetime(2020, 1, 1, 12, 0, 0, tzinfo=tzutc())
+
+    nwbfile = NWBFile(
+        session_description="test session with units",
+        identifier="test_units_123",
+        session_start_time=session_start,
+    )
+
+    # Add a simple TimeSeries that spans from 10 to 30 seconds
+    data = np.random.rand(200)
+    timestamps = np.linspace(10, 30, 200)
+    ts = TimeSeries(name="timeseries1", data=data, unit="volts", timestamps=timestamps)
+    nwbfile.add_acquisition(ts)
+
+    # Add Units with spike_times that extend session to 250 seconds
+    # Unit 1: spikes from 5s to 100s
+    # Unit 2: spikes from 50s to 250s
+    nwbfile.add_unit(spike_times=np.array([5.0, 10.0, 20.0, 50.0, 100.0]))
+    nwbfile.add_unit(spike_times=np.array([50.0, 100.0, 150.0, 200.0, 250.0]))
+
+    # Write the file
+    with NWBHDF5IO(str(nwb_path), "w") as io:
+        io.write(nwbfile)
+
+    # Extract metadata
+    from ..metadata.nwb import get_metadata
+
+    metadata = get_metadata(nwb_path)
+
+    # Check that session_end_time was calculated
+    assert "session_start_time" in metadata
+    assert "session_end_time" in metadata
+
+    # Duration should be 250 (max spike) - 5 (min spike) = 245 seconds
+    duration = (
+        metadata["session_end_time"] - metadata["session_start_time"]
+    ).total_seconds()
+    assert abs(duration - 245.0) < 1.0  # Allow small floating point errors
+
+
+@pytest.mark.ai_generated
+def test_session_duration_with_events(tmp_path: Path) -> None:
+    """Test that session duration includes timestamp/duration from DynamicTable"""
+    # Create a test NWB file with a DynamicTable containing timestamp and duration
+    nwb_path = tmp_path / "test_duration_events.nwb"
+    session_start = datetime(2020, 1, 1, 12, 0, 0, tzinfo=tzutc())
+
+    nwbfile = NWBFile(
+        session_description="test session with events",
+        identifier="test_events_123",
+        session_start_time=session_start,
+    )
+
+    # Add a simple TimeSeries that spans from 5 to 20 seconds
+    data = np.random.rand(150)
+    timestamps = np.linspace(5, 20, 150)
+    ts = TimeSeries(name="timeseries1", data=data, unit="volts", timestamps=timestamps)
+    nwbfile.add_acquisition(ts)
+
+    # Create a DynamicTable with timestamp and duration columns (similar to EventsTable)
+
+    events_table = DynamicTable(
+        name="events",
+        description="test events with timestamps and durations",
+    )
+    events_table.add_column(
+        name="timestamp",
+        description="event timestamps",
+    )
+    events_table.add_column(
+        name="duration",
+        description="event durations",
+    )
+
+    # Add events: 3s lasting 2s (ends at 5s), 100s lasting 30s (ends at 130s),
+    #             and 150s lasting 10s (ends at 160s)
+    events_table.add_row(timestamp=3.0, duration=2.0)
+    events_table.add_row(timestamp=100.0, duration=30.0)
+    events_table.add_row(timestamp=150.0, duration=10.0)
+
+    # Add the table to a processing module
+    processing_module = nwbfile.create_processing_module(
+        name="behavior", description="behavioral data"
+    )
+    processing_module.add(events_table)
+
+    # Write the file
+    with NWBHDF5IO(str(nwb_path), "w") as io:
+        io.write(nwbfile)
+
+    # Extract metadata
+    from ..metadata.nwb import get_metadata
+
+    metadata = get_metadata(nwb_path)
+
+    # Check that session_end_time was calculated
+    assert "session_start_time" in metadata
+    assert "session_end_time" in metadata
+
+    # Duration should be 160 (150 + 10, last end) - 3 (first timestamp) = 157 seconds
+    duration = (
+        metadata["session_end_time"] - metadata["session_start_time"]
+    ).total_seconds()
+    assert abs(duration - 157.0) < 1.0  # Allow small floating point errors
+
+
 @mark_xfail_ontobee
 @mark.skipif_no_network
 @pytest.mark.obolibrary

Original file line number	Diff line number	Diff line change
`@@ -588,6 +588,7 @@ def extract_session(metadata: dict) -> list[models.Session] \| None:`
`588`	`588`	`name=session_id or "Acquisition session",`
`589`	`589`	`description=metadata.get("session_description"),`
`590`	`590`	`startDate=metadata.get("session_start_time"),`
	`591`	`+ endDate=metadata.get("session_end_time"),`
`591`	`592`	`used=probes,`
`592`	`593`	`)`
`593`	`594`	`]`