|
| 1 | +import os |
| 2 | +import numpy as np |
| 3 | +import pandas as pd |
| 4 | +import matplotlib.pyplot as plt |
| 5 | +import seaborn as sns |
| 6 | + |
| 7 | + |
class Group:
    """Aggregates per-subject behavioral task CSVs from a nested study tree.

    Expected on-disk layout (relative to ``root_dir``)::

        <root_dir>/<study>/<site>/<subject>/<task_name>/data/*.csv
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _iter_task_csvs(task_name, root_dir):
        """Yield ``(subject_id, csv_path)`` for each matching task CSV.

        Walks root_dir/<study>/<site>/<subject>/<task_name>/data and yields
        only session-1 files: names ending in '2.csv' are skipped first
        (session-2 files), then only '.csv' files containing '_cat-1_' match.
        """
        for study in os.listdir(root_dir):
            study_path = os.path.join(root_dir, study)
            if not os.path.isdir(study_path):
                continue
            for site in os.listdir(study_path):
                site_path = os.path.join(study_path, site)
                if not os.path.isdir(site_path):
                    continue
                for subject in os.listdir(site_path):
                    # If data_folder is a directory, all of its parents are
                    # too, so a single isdir check covers the whole chain.
                    data_folder = os.path.join(
                        site_path, subject, task_name, "data"
                    )
                    if not os.path.isdir(data_folder):
                        continue
                    for file in os.listdir(data_folder):
                        # '2.csv' suffix marks a session-2 file: reject it
                        # before the cat-1 test so a session-2 cat-1 file is
                        # never picked up.
                        if file.endswith("2.csv"):
                            continue
                        if file.endswith(".csv") and "_cat-1_" in file:
                            yield subject, os.path.join(data_folder, file)

    @staticmethod
    def load_task_data(task_name, relevant_columns, root_dir="../../../data"):
        """
        Loads all subject CSVs corresponding to a specified task, extracts relevant columns,
        and appends them (with the subject ID as the first column) into one DataFrame.

        Parameters:
          task_name (str): The name of the task folder (e.g., "AF", "NTS", "DWL").
          relevant_columns (list): A list of column names (strings) to extract from each CSV.
          root_dir (str): The root data directory. Default is "../../../data".

        Returns:
          pandas.DataFrame: A DataFrame containing the subject ID (as 'subjectID') and the
          relevant columns from every CSV that was found. If no files match, an empty
          DataFrame with the expected columns is returned.
        """
        data_frames = []

        for subject, csv_path in Group._iter_task_csvs(task_name, root_dir):
            print(f"Processing {csv_path}")
            try:
                temp_df = pd.read_csv(csv_path)
                # Files missing a requested column raise KeyError here and
                # are reported + skipped rather than aborting the whole run.
                filtered_df = temp_df[relevant_columns].copy()
            except Exception as e:
                print(f"Error processing {csv_path}: {e}")
                continue
            # Subject ID (directory name) goes in as the first column.
            filtered_df.insert(0, "subjectID", subject)
            data_frames.append(filtered_df)

        if data_frames:
            return pd.concat(data_frames, ignore_index=True)
        # No data found: empty frame with the appropriate column layout.
        return pd.DataFrame(columns=["subjectID"] + relevant_columns)

    def return_dfs(self):
        """Placeholder — not yet implemented."""
        pass
0 commit comments