Commit bff9afd

Merge branch 'remotecontrolling' of https://github.com/DFKI-SignLanguage/RecSync-android into remotecontrolling

2 parents: 3721aae + 68986dd
File tree

7 files changed: +623 −6 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions

@@ -42,3 +42,7 @@ opencamera-extended-firebase-adminsdk-yv5yz-e33a8ce5c1.json
 # OS
 .DS_Store
+
+#
+# Python stuff
+*.pyc
Lines changed: 164 additions & 0 deletions

@@ -0,0 +1,164 @@
import argparse
from pathlib import Path
from typing import List, Tuple
import tempfile

import pandas as pd
import re

from dataframes import compute_time_range, trim_repaired_into_interval
from dataframes import repair_dropped_frames, compute_time_step

from video import extract_frames
from video import rebuild_video


THRESHOLD_NS = 10 * 1000 * 1000  # 10 ms, in nanoseconds (millis * micros * nanos)


def scan_session_dir(input_dir: Path) -> Tuple[List[str], List[pd.DataFrame], List[str]]:
    #
    # Find all CSV files in the directory and read each into a data frame.
    # Use the following regular expression to check whether the client ID is a 16-digit hexadecimal.
    clientIDpattern = "[\\da-f]" * 16
    patt = re.compile("^" + clientIDpattern + "$")

    # Fill this list with the client IDs found in the directory
    clientIDs: List[str] = []
    for p in input_dir.iterdir():
        # Check if the ClientID complies with the numerical format (using regex).
        res = patt.match(p.stem)
        if res:
            print("Found client -->", p.stem)
            clientIDs.append(p.stem)
        else:
            print("Discarding ", p.stem)

    #
    # Accumulate the lists of dataframes and mp4 files in the same order as the client IDs.
    df_list: List[pd.DataFrame] = []
    mp4_list: List[str] = []

    for cID in clientIDs:
        client_dir = input_dir / cID
        CSVs = list(client_dir.glob("*.csv"))
        MP4s = list(client_dir.glob("*.mp4"))
        #
        # Consistency check. Each clientID folder must have exactly 1 CSV and 1 mp4.
        if len(CSVs) != 1:
            raise Exception(f"Expecting 1 CSV file for client {cID}. Found {len(CSVs)}.")

        if len(MP4s) != 1:
            raise Exception(f"Expecting 1 MP4 file for client {cID}. Found {len(MP4s)}.")

        csv_file = CSVs[0]
        mp4_file = MP4s[0]

        df: pd.DataFrame = pd.read_csv(csv_file, header=None)

        df_list.append(df)
        mp4_list.append(str(mp4_file))

    return clientIDs, df_list, mp4_list


#
#
#
def main(input_dir: Path, output_dir: Path):

    print(f"Scanning dir {str(input_dir)}...")
    clientIDs, df_list, mp4_list = scan_session_dir(input_dir)

    n_clients = len(clientIDs)

    #
    # Print collected info
    for i in range(n_clients):
        cID = clientIDs[i]
        df = df_list[i]
        mp4 = mp4_list[i]
        print(f"For client ID {cID}: {len(df)} frames for file {mp4}")

    #
    # Repair CSVs
    repaired_df_list: List[pd.DataFrame] = []
    for cID, df in zip(clientIDs, df_list):
        time_step = compute_time_step(df)
        repaired_df = repair_dropped_frames(df=df, time_step=time_step)
        repaired_df_list.append(repaired_df)

    assert len(clientIDs) == len(df_list) == len(mp4_list) == len(repaired_df_list)

    #
    # Trim CSVs
    # Find time ranges
    min_common, max_common = compute_time_range(repaired_df_list)
    # Trim the data frames to the time range
    trimmed_dataframes = trim_repaired_into_interval(repaired_df_list, min_common, max_common, THRESHOLD_NS)

    assert len(clientIDs) == len(trimmed_dataframes), f"Expected {len(clientIDs)} trimmed dataframes. Found {len(trimmed_dataframes)}."

    # Check that all the resulting dataframes have the same number of rows
    client0ID = clientIDs[0]
    client0size = len(trimmed_dataframes[0])
    print(f"For client {client0ID}: {client0size} frames")
    for cID, df in zip(clientIDs[1:], trimmed_dataframes[1:]):
        dfsize = len(df)
        if client0size != dfsize:
            raise Exception(f"For client {cID}: expecting {client0size} frames, found {dfsize}")

    print("Good. All trimmed dataframes have the same number of entries.")

    #
    # Unpack the original videos, and repack them according to the repaired and trimmed dataframes.
    for i, cID in enumerate(clientIDs):
        orig_df = df_list[i]
        trimmed_df = trimmed_dataframes[i]
        video_file = mp4_list[i]
        # Create a temporary directory for frame unpacking
        with tempfile.TemporaryDirectory(prefix="RecSyncNG", suffix=cID) as tmp_dir:
            # Extract the frames from the original video
            # and rename the files to their timestamps
            print(f"Extracting {len(orig_df)} frames from '{video_file}'...")
            extract_frames(video_file=video_file, timestamps_df=orig_df, output_dir=tmp_dir)

            # Reconstruct the video
            video_out_filepath = output_dir / (cID + ".mp4")
            rebuild_video(dir=Path(tmp_dir), frames=trimmed_df, outfile=video_out_filepath)
            # And also save the CSV
            csv_out_filepath = video_out_filepath.with_suffix(".csv")
            trimmed_df.to_csv(path_or_buf=csv_out_filepath, header=True, index=False)


#
# MAIN
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Fixes the videos produced by the RecSync recording sessions. "
                    "Output videos will have the same number of frames, "
                    "with missing/dropped frames inserted as (black) artificial data."
    )
    parser.add_argument(
        "--infolder", "-i", type=str, help="The folder containing the collected videos and CSV files with the timestamps.",
        required=True
    )
    parser.add_argument(
        "--outfolder", "-o", type=str, help="The folder where the repaired and aligned frames will be stored.",
        required=True
    )

    args = parser.parse_args()

    infolder = Path(args.infolder)
    outfolder = Path(args.outfolder)

    if not infolder.exists():
        raise Exception(f"Input folder '{infolder}' doesn't exist.")

    if not outfolder.exists():
        raise Exception(f"Output folder '{outfolder}' doesn't exist.")

    main(infolder, outfolder)
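
For orientation, here is a minimal sketch of how the script can be driven programmatically rather than via the CLI; the folder names are made up, and `main` is the function defined above:

```python
from pathlib import Path

# Hypothetical paths: the input folder holds one 16-hex-digit subdir per client.
in_dir = Path("recordings/session01")
out_dir = Path("recordings/session01-repaired")
out_dir.mkdir(parents=True, exist_ok=True)  # the script requires it to exist

main(in_dir, out_dir)  # writes one repaired <clientID>.mp4 + .csv per client
```

This is equivalent to running the script with `--infolder recordings/session01 --outfolder recordings/session01-repaired`.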

PostProcessing/README.md

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
# RecSync PostProcessing

This utility "repairs" the videos generated by the RecSync NG app:

* Videos are unpacked
* Missing frames are injected as black frames
* Videos are re-packed, so that they all have the same starting frame and the same number of frames
* Optionally, a frame counter is added


## Installing

Create an environment and install the packages listed in `requirements.txt`.


## Testing

Export an environment variable pointing to the root of your test material (the directory containing the client subdirs), then run the `pytest` command.

```bash
export RECSYNCH_SESSION_DIR=path/to/my/stuff
pytest
```
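
For illustration, a test along these lines might look like the following sketch; the test name and body are assumptions, not the repo's actual test suite (only `dataframes` and the env var above come from this commit):

```python
import os
from pathlib import Path

import pandas as pd
import pytest

from dataframes import compute_time_step  # module added in this commit

# Hypothetical smoke test: each client subdir holds one timestamp CSV,
# and a positive time step can be derived from it.
@pytest.mark.skipif("RECSYNCH_SESSION_DIR" not in os.environ,
                    reason="RECSYNCH_SESSION_DIR not set")
def test_time_step_per_client():
    session_dir = Path(os.environ["RECSYNCH_SESSION_DIR"])
    for client_dir in session_dir.iterdir():
        if not client_dir.is_dir():
            continue
        csvs = list(client_dir.glob("*.csv"))
        assert len(csvs) == 1
        df = pd.read_csv(csvs[0], header=None)
        assert compute_time_step(df) > 0
```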

PostProcessing/dataframes.py

Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
import pandas as pd
import numpy as np

from typing import Tuple, List


def compute_time_step(video_timestamps: pd.DataFrame) -> float:
    """
    Compute the time step of a video based on its timestamps.

    Parameters:
    video_timestamps (pd.DataFrame): A pandas DataFrame containing the timestamps of a video.

    Returns:
    float: The time step of the video timestamps.
    """

    first_col_name = video_timestamps.columns[0]
    # Retrieve the most frequent time difference between consecutive lines.
    time_step = (video_timestamps[first_col_name].diff()).dropna().value_counts().index[0]

    return time_step


def repair_dropped_frames(df: pd.DataFrame, time_step: float) -> pd.DataFrame:
    # The name of the first column (can be anything, as the original df doesn't have a header)
    first_col_name = df.columns[0]

    # Force the type of the timestamps to int64
    df[first_col_name] = pd.to_datetime(df[first_col_name]).astype(np.int64)
    # Retrieve the timestamps into a Series
    timestamps = df[first_col_name]
    # Will accumulate the repaired rows
    repaired_rows = []

    # Check for missing timestamps and generate them
    for i in range(len(timestamps) - 1):
        timestamp = timestamps.iloc[i]
        next_timestamp = timestamps.iloc[i + 1]

        # The current timestamp is by definition original
        repaired_rows.append([timestamp, 'Original'])

        # If the next timestamp exceeds the expected time step
        if next_timestamp - timestamp > time_step:
            # Estimate the number of missing frames
            missing_timestamps_count = round((next_timestamp - timestamp) / time_step) - 1
            # Estimate a time interval between them (will be very similar to the input time_step)
            interval = (next_timestamp - timestamp) / (missing_timestamps_count + 1)
            # Generate the missing lines
            for j in range(1, missing_timestamps_count + 1):
                new_timestamp = (timestamp + j * interval).astype(np.int64)
                repaired_rows.append([new_timestamp, 'Generated'])

    # Add the last row
    repaired_rows.append([timestamps.iloc[-1], 'Original'])
    # print(len(repaired_rows))

    # Create a new DataFrame with the repaired rows
    columns = ['timestamp', 'generated']
    output_df = pd.DataFrame(repaired_rows, columns=columns)
    # Force the output timestamp type to int64
    output_df['timestamp'] = pd.to_datetime(output_df['timestamp']).astype(np.int64)

    return output_df


def save_dataframes(dataframes, prefix='df') -> None:
    # Generate filenames based on a pattern or numbering scheme
    filenames = [f"{prefix}{i}.csv" for i in range(1, len(dataframes) + 1)]

    # Save each DataFrame to a separate file
    for i, df in enumerate(dataframes):
        filename = filenames[i]
        df.to_csv(filename, index=False, header=False)
    print("DataFrames saved successfully.")


# Function to find the largest value among the first entries of all dataframes
def find_largest_first_entry(dfs):
    largest_value = float('-inf')
    for df in dfs:
        first_entry = df.iloc[0, 0]
        if first_entry > largest_value:
            largest_value = first_entry
    return largest_value


# Function to find the smallest value among the last entries of the dataframes
def find_smallest_last_entry(dfs):
    smallest_value = float('inf')
    for df in dfs:
        last_entry = df.iloc[-1, 0]
        if last_entry < smallest_value:
            smallest_value = last_entry
    return smallest_value


# Function to find the common time range covered by all dataframes
def compute_time_range(dfs) -> Tuple[int, int]:
    # Find the lowest and highest timestamps common to all the data frames
    lower_value = find_largest_first_entry(dfs)
    higher_value = find_smallest_last_entry(dfs)

    # return the results
    return lower_value, higher_value


# Function to trim dataframes to the specified interval (plus/minus a threshold)
def trim_repaired_into_interval(dfs, min_common, max_common, threshold) -> List[pd.DataFrame]:

    trimmed_dataframes: List[pd.DataFrame] = []

    lo_threshold = min_common - threshold
    hi_threshold = max_common + threshold

    for df in dfs:

        selection_mask = df["timestamp"].between(lo_threshold, hi_threshold, inclusive='both')
        trimmed_df = df[selection_mask]
        trimmed_dataframes.append(trimmed_df)

    return trimmed_dataframes
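
A small synthetic check of how these functions compose, with made-up nanosecond timestamps (not from the repo's tests):

```python
import pandas as pd

from dataframes import (compute_time_step, repair_dropped_frames,
                        compute_time_range, trim_repaired_into_interval)

STEP = 33_000_000  # ~33 ms in ns, i.e. ~30 fps

# Client A: frames at steps 0,1,2,4 (frame 3 dropped); client B: frames 1..5.
df_a = pd.DataFrame([0, STEP, 2 * STEP, 4 * STEP])
df_b = pd.DataFrame([STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP])

rep_a = repair_dropped_frames(df_a, compute_time_step(df_a))  # adds one 'Generated' row
rep_b = repair_dropped_frames(df_b, compute_time_step(df_b))  # no gaps, unchanged

lo, hi = compute_time_range([rep_a, rep_b])  # common interval: [STEP, 4*STEP]
trimmed = trim_repaired_into_interval([rep_a, rep_b], lo, hi, threshold=0)

# Both clients end up with the same 4 frames (steps 1..4).
assert all(len(df) == 4 for df in trimmed)
```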
