
Commit b58994d

RecordingExtractors (#171)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 3b45d09 commit b58994d

14 files changed, +1721 -1055 lines

src/guppy/extractors/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
from .base_recording_extractor import BaseRecordingExtractor, read_and_save_event, read_and_save_all_events
from .tdt_recording_extractor import TdtRecordingExtractor
from .csv_recording_extractor import CsvRecordingExtractor
from .doric_recording_extractor import DoricRecordingExtractor
from .npm_recording_extractor import NpmRecordingExtractor
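These re-exports define the public surface of the guppy.extractors package, so downstream code can pull the extractor classes and helpers from one place, e.g.:

from guppy.extractors import CsvRecordingExtractor, read_and_save_all_events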
src/guppy/extractors/base_recording_extractor.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
"""Base class for recording extractors."""

import logging
import multiprocessing as mp
import os
import time
from abc import ABC, abstractmethod
from itertools import repeat
from typing import Any

import h5py
import numpy as np

logger = logging.getLogger(__name__)

class BaseRecordingExtractor(ABC):
    """
    Abstract base class for recording extractors.

    Defines the interface contract for reading and saving fiber photometry
    data from various acquisition formats (TDT, Doric, CSV, NPM, etc.).
    """

    @classmethod
    @abstractmethod
    def discover_events_and_flags(cls) -> tuple[list[str], list[str]]:
        """
        Discover available events and format flags from data files.

        Returns
        -------
        events : list of str
            Names of all events/stores available in the dataset.
        flags : list of str
            Format indicators or file type flags.
        """
        # NOTE: This method signature is intentionally minimal and flexible.
        # Different formats have different discovery requirements:
        #   - TDT/CSV/Doric: need only the folder_path parameter
        #   - NPM: needs folder_path, num_ch, and an optional inputParameters
        #     for interleaved channels
        # Each child class defines its own signature with the parameters it needs.
        pass

    @abstractmethod
    def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]:
        """
        Read data from source files for specified events.

        Parameters
        ----------
        events : list of str
            List of event/store names to extract from the data.
        outputPath : str
            Path to the output directory.

        Returns
        -------
        list of dict
            List of dictionaries containing extracted data. Each dictionary
            represents one event/store and contains keys such as 'storename',
            'timestamps', 'data', 'sampling_rate', etc.
        """
        pass

    @abstractmethod
    def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None:
        """
        Save extracted data dictionaries to HDF5 format.

        Parameters
        ----------
        output_dicts : list of dict
            List of data dictionaries from read().
        outputPath : str
            Path to the output directory.
        """
        pass

    @staticmethod
    def _write_hdf5(data: Any, storename: str, output_path: str, key: str) -> None:
        """
        Write data to HDF5 file.

        Parameters
        ----------
        data : array-like
            Data to write to the HDF5 file.
        storename : str
            Name of the store/event.
        output_path : str
            Directory path where HDF5 file will be written.
        key : str
            Key name for this data field in the HDF5 file.
        """
        # Replace invalid characters in storename to avoid filesystem errors
        storename = storename.replace("\\", "_")
        storename = storename.replace("/", "_")

        filepath = os.path.join(output_path, storename + ".hdf5")

        # Create a new file if it doesn't exist
        if not os.path.exists(filepath):
            with h5py.File(filepath, "w") as f:
                if isinstance(data, np.ndarray):
                    f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
                else:
                    f.create_dataset(key, data=data)
        # Otherwise update the existing file in place
        else:
            with h5py.File(filepath, "r+") as f:
                if key in f:
                    if isinstance(data, np.ndarray):
                        # Resizable dataset: grow or shrink to fit the new data
                        f[key].resize(data.shape)
                        f[key][:] = data
                    else:
                        # Scalar dataset: overwrite the stored value
                        f[key][()] = data
                else:
                    if isinstance(data, np.ndarray):
                        f.create_dataset(key, data=data, maxshape=(None,), chunks=True)
                    else:
                        f.create_dataset(key, data=data)

def read_and_save_event(extractor, event, outputPath):
    """Read a single event with the given extractor and save it to disk."""
    output_dicts = extractor.read(events=[event], outputPath=outputPath)
    extractor.save(output_dicts=output_dicts, outputPath=outputPath)
    logger.info("Data for event {} fetched and stored.".format(event))


def read_and_save_all_events(extractor, events, outputPath, numProcesses=mp.cpu_count()):
    """Read and save all events in parallel, one worker per event."""
    logger.info("Reading data for events {} ...".format(events))

    start = time.time()
    with mp.Pool(numProcesses) as p:
        p.starmap(read_and_save_event, zip(repeat(extractor), events, repeat(outputPath)))
    logger.info("Time taken = {0:.5f}".format(time.time() - start))
src/guppy/extractors/csv_recording_extractor.py

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
import glob
import logging
import os
from typing import Any

import numpy as np
import pandas as pd

from guppy.extractors import BaseRecordingExtractor

logger = logging.getLogger(__name__)


class CsvRecordingExtractor(BaseRecordingExtractor):

    @classmethod
    def discover_events_and_flags(cls, folder_path) -> tuple[list[str], list[str]]:
        """
        Discover available events and format flags from CSV files.

        Parameters
        ----------
        folder_path : str
            Path to the folder containing CSV files.

        Returns
        -------
        events : list of str
            Names of all events/stores available in the dataset.
        flags : list of str
            Format indicators or file type flags.
        """
        logger.debug("Determining whether each file is an NPM, Doric, or standard csv file based on its structure")
        path = sorted(set(glob.glob(os.path.join(folder_path, "*.csv"))))

        flag = "None"
        event_from_filename = []
        flag_arr = []
        for i in range(len(path)):
            ext = os.path.basename(path[i]).split(".")[-1]
            assert ext == "csv", "Only .csv files are supported by CsvRecordingExtractor."
            # Peek at the first two rows: if every cell is non-numeric, the file
            # is most likely a Doric csv, which this extractor does not handle.
            df = pd.read_csv(path[i], header=None, nrows=2, index_col=False, dtype=str)
            df = df.dropna(axis=1, how="all")
            df_arr = np.array(df).flatten()
            check_all_str = []
            for element in df_arr:
                try:
                    float(element)
                except ValueError:
                    check_all_str.append(element)
            assert len(check_all_str) != len(df_arr), (
                "This file appears to be a Doric .csv. "
                "CsvRecordingExtractor only supports standard .csv files."
            )
            df = pd.read_csv(path[i], index_col=False)

            _, value = cls._check_header(df)

            # check dataframe structure and read data accordingly
            if len(value) > 0:
                # numeric column labels mean the file has no header row
                columns_isstr = False
                df = pd.read_csv(path[i], header=None)
            else:
                columns_isstr = True
            cols = np.array(list(df.columns), dtype=str)

            # check the structure of the dataframe and assign a flag for the file type
            if len(cols) == 1:
                if cols[0].lower() != "timestamps":
                    logger.error("\033[1m" + "Column name should be timestamps (all lower-case)" + "\033[0m")
                    raise Exception("\033[1m" + "Column name should be timestamps (all lower-case)" + "\033[0m")
                else:
                    flag = "event_csv"
            elif len(cols) == 3:
                arr1 = np.array(["timestamps", "data", "sampling_rate"])
                arr2 = np.char.lower(np.array(cols))
                if not (np.sort(arr1) == np.sort(arr2)).all():
                    logger.error(
                        "\033[1m"
                        + "Column names should be timestamps, data and sampling_rate (all lower-case)"
                        + "\033[0m"
                    )
                    raise Exception(
                        "\033[1m"
                        + "Column names should be timestamps, data and sampling_rate (all lower-case)"
                        + "\033[0m"
                    )
                else:
                    flag = "data_csv"
            elif len(cols) == 2 or len(cols) > 3:
                raise ValueError(
                    "Data appears to be a Neurophotometrics csv. Please use NpmRecordingExtractor to import the data."
                )
            else:
                logger.error("Number of columns in csv file does not make sense.")
                raise Exception("Number of columns in csv file does not make sense.")

            if columns_isstr and (
                "flags" in np.char.lower(np.array(cols)) or "ledstate" in np.char.lower(np.array(cols))
            ):
                flag = flag + "_v2"

            flag_arr.append(flag)
            logger.info(flag)
            assert flag == "event_csv" or flag == "data_csv", (
                "This extractor only supports standard event_csv and data_csv files."
            )
            name = os.path.basename(path[i]).split(".")[0]
            event_from_filename.append(name)

        logger.info("Importing of csv files is done.")
        return event_from_filename, flag_arr

    def __init__(self, folder_path):
        self.folder_path = folder_path

    @staticmethod
    def _check_header(df):
        # Collect column labels that parse as floats; a non-empty result
        # means the csv has no header row.
        arr = list(df.columns)
        check_float = []
        for i in arr:
            try:
                check_float.append(float(i))
            except ValueError:
                pass

        return arr, check_float

    def _read_csv(self, event):
        logger.debug("\033[1m" + "Trying to read data for {} from csv file.".format(event) + "\033[0m")
        filepath = os.path.join(self.folder_path, event + ".csv")
        if not os.path.exists(filepath):
            logger.error("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m")
            raise Exception("\033[1m" + "No csv file found for event {}".format(event) + "\033[0m")

        df = pd.read_csv(filepath, index_col=False)
        return df

    def _save_to_hdf5(self, df, event, outputPath):
        key = list(df.columns)

        # TODO: clean up these if branches
        if len(key) == 3:
            arr1 = np.array(["timestamps", "data", "sampling_rate"])
            arr2 = np.char.lower(np.array(key))
            if not (np.sort(arr1) == np.sort(arr2)).all():
                logger.error("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m")
                raise Exception("\033[1m" + "Column names should be timestamps, data and sampling_rate" + "\033[0m")

        if len(key) == 1:
            if key[0].lower() != "timestamps":
                logger.error("\033[1m" + "Column name should be timestamps" + "\033[0m")
                raise Exception("\033[1m" + "Column name should be timestamps" + "\033[0m")

        if len(key) != 3 and len(key) != 1:
            msg = (
                "Number of columns in csv file should be either three or one: "
                "three columns if the file holds control or signal data, "
                "one column if the file holds event TTLs."
            )
            logger.error("\033[1m" + msg + "\033[0m")
            raise Exception("\033[1m" + msg + "\033[0m")

        for i in range(len(key)):
            self._write_hdf5(df[key[i]].dropna(), event, outputPath, key[i].lower())

        logger.info("\033[1m" + "Reading data for {} from csv file is completed.".format(event) + "\033[0m")

    def read(self, *, events: list[str], outputPath: str) -> list[dict[str, Any]]:
        output_dicts = []
        for event in events:
            df = self._read_csv(event=event)
            S = df.to_dict()
            S["storename"] = event
            output_dicts.append(S)
        return output_dicts

    def save(self, *, output_dicts: list[dict[str, Any]], outputPath: str) -> None:
        for S in output_dicts:
            event = S.pop("storename")
            df = pd.DataFrame.from_dict(S)
            self._save_to_hdf5(df=df, event=event, outputPath=outputPath)
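
Putting the pieces together, a minimal usage sketch (the folder and output paths are placeholders, not part of this commit): a session folder holding signal.csv with columns timestamps, data, sampling_rate and lick.csv with a single timestamps column could be processed as follows.

import os

from guppy.extractors import CsvRecordingExtractor

folder = "/data/session1"              # placeholder session folder
out = os.path.join(folder, "output")   # placeholder output directory
os.makedirs(out, exist_ok=True)        # _write_hdf5 expects the directory to exist

# Discovery returns one (event, flag) pair per .csv file,
# e.g. (["lick", "signal"], ["event_csv", "data_csv"]).
events, flags = CsvRecordingExtractor.discover_events_and_flags(folder)

extractor = CsvRecordingExtractor(folder)
output_dicts = extractor.read(events=events, outputPath=out)
extractor.save(output_dicts=output_dicts, outputPath=out)  # writes one <event>.hdf5 per event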
