KumarLabJax
diff --git a/‎README.md‎
Lines changed: 13 additions & 1 deletion b/‎README.md‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/jabs_postprocess/utils/project_utils.py‎
Lines changed: 60 additions & 8 deletions b/‎src/jabs_postprocess/utils/project_utils.py‎
Lines changed: 60 additions & 8 deletions
@@ -121,7 +121,7 @@ Lots of the functions used in generating these behavior tables were designed for
 
 There are two behavior tables generated. Both contain a header line to store parameters used while calling the script.
 
-Some features are optional, because calculating them can be expensive. These options are noted with an asterisk (\*). While default behavior is to include them, they are not guaranteed.
+Some features are optional, either because calculating them can be expensive or because they are controlled via optional arguments. These options are noted with an asterisk (\*). While default behavior is to include them, they are not guaranteed.
 
 ## Header Data
 
@@ -150,6 +150,11 @@ The bout table contains a compressed RLE encoded format for each bout (post-filt
     * `0` : Not behavior prediction
     * `1` : Behavior prediction
 * `distance`\* : Distance traveled during bout
+* `total_bout_count`\* : Number of behavior bouts per animal
+* `avg_bout_duration`\* : Average duration of all bouts per animal
+* `bout_duration_std`\* : Standard deviation of all bout durations
+* `bout_duration_var`\* : Variance of all bout durations
+* `latency_to_first_bout`\* : Frame number of first behavior bout
 
 ## Binned Table
 
@@ -168,6 +173,13 @@ Summaries included:
     * If a bout spans multiple time bins, it will be divided into both via the proportion of time
     * Sum of bouts across bins produces the correct total count
     * Note that bouts cannot span between video files
+* `_stats_sample_count` : Sample count used in stats calculation (count of whole and partial bouts in time bin)
+* `avg_bout_duration` : Average bout duration per animal (in time bin)
+* `bout_duration_std` : Standard deviation of bout durations (in time bin)
+* `bout_duration_var` : Variance of bout durations (in time bin)
+* `latency_to_first_prediction` : Frame number of first behavior prediction in the time bin
+    * Frame is relative to the experiment start, not the time bin
+* `latency_to_last_prediction` : Frame number of last behavior prediction in the time bin
 * `not_behavior_dist`\* : Total distance traveled during not behavior bouts
 * `behavior_dist`\* : Total distance traveled during behavior bouts
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "jabs-postprocess"
-version = "0.4.2"
+version = "0.5.0"
 description = "A python library for JABS postprocessing utilities."
 readme = "README.md"
 license = "LicenseRef-PLATFORM-LICENSE-AGREEMENT-FOR-NON-COMMERCIAL-USE"
 
@@ -802,7 +802,6 @@ def add_bout_statistics(self):
             - bout_duration_var: Variance of bout durations for this animal
             - latency_to_first_bout: Frame number of first behavior bout (if any)
         """
-
         # Group by animal and calculate statistics for behavior bouts only
         behavior_bouts = self._data[self._data["is_behavior"] == 1]
 
@@ -902,7 +901,9 @@ def bouts_to_bins(
                 Binned event data describing the event data.
 
         Notes:
-                Binned data describes event data as summaries. For each state, total time and distance travelled are provided. Additionally, the number of behavior events are counted.
+                Binned data describes event data as summaries.
+                For each state, total time and distance travelled are provided.
+                Additionally, the number of behavior events is counted.
                 Events that span multiple bins are split between them based on the percent in each, allowing fractional bout counts.
         """
         # Get the range that the experiment spans
@@ -1013,12 +1014,57 @@ def bouts_to_bins(
             results["time_not_behavior"] = bins_to_summarize.loc[
                 bins_to_summarize["is_behavior"] == 0, "duration"
             ].sum()
-            results["time_behavior"] = bins_to_summarize.loc[
-                bins_to_summarize["is_behavior"] == 1, "duration"
-            ].sum()
-            results["bout_behavior"] = len(
-                bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1]
-            )
+
+            # Lots of "behavior" stats are run, so separate them for convenience
+            behavior_bins = bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1]
+
+            results["time_behavior"] = behavior_bins["duration"].sum()
+            results["bout_behavior"] = behavior_bins["percent_bout"].sum()
+            results["_stats_sample_count"] = len(behavior_bins)
+            # We use weighted statistic definitions here
+            # Weights are the proportion of bout contained in the bin (percent_bout)
+            if results["bout_behavior"] > 0:
+                results["avg_bout_duration"] = (
+                    np.sum(
+                        behavior_bins["duration"].values
+                        * behavior_bins["percent_bout"].values
+                    )
+                    / results["bout_behavior"]
+                )
+                results["latency_to_first_prediction"] = behavior_bins["start"].min()
+                results["latency_to_last_prediction"] = (
+                    behavior_bins["start"] + behavior_bins["duration"]
+                ).max()
+
+                # Variance requires more than one effective bout
+                if len(behavior_bins) > 1:
+                    denom = (
+                        (len(behavior_bins) - 1)
+                        * results["bout_behavior"]
+                        / len(behavior_bins)
+                    )
+                    results["bout_duration_var"] = (
+                        np.sum(
+                            behavior_bins["percent_bout"].values
+                            * np.square(
+                                behavior_bins["duration"].values
+                                / behavior_bins["percent_bout"].values
+                                - results["avg_bout_duration"]
+                            )
+                        )
+                        / denom
+                    )
+                    results["bout_duration_std"] = np.sqrt(results["bout_duration_var"])
+                else:
+                    results["bout_duration_var"] = np.nan
+                    results["bout_duration_std"] = np.nan
+            else:
+                # No behavior data - set all defaults
+                results["avg_bout_duration"] = np.nan
+                results["bout_duration_var"] = np.nan
+                results["bout_duration_std"] = np.nan
+                results["latency_to_first_prediction"] = np.nan
+                results["latency_to_last_prediction"] = np.nan
             if "distance" in bins_to_summarize.keys():
                 results["not_behavior_dist"] = bins_to_summarize.loc[
                     bins_to_summarize["is_behavior"] == 0, "calc_dist"
@@ -1107,6 +1153,12 @@ def __init__(self, settings: ClassifierSettings, data: pd.DataFrame):
             "time",
             "not_behavior_dist",
             "behavior_dist",
+            "avg_bout_duration",
+            "_stats_sample_count",
+            "bout_duration_std",
+            "bout_duration_var",
+            "latency_to_first_prediction",
+            "latency_to_last_prediction",
         ]
         self._check_fields()