1- import os
2- from multiprocessing import Pool
1+ from typing import Tuple
32import time
43
5- import pandas as pd
4+ import pandas as pd
65from tqdm import tqdm
76
87from topcoder_cognitive_state .CONSTANTS import METADATA_COLUMNS , NAN_VALUES
98
109
def _test_missing_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Corrupt the feature columns of ``df`` to simulate missing sensor data.

    This is a manual test helper with three scenarios, of which exactly one
    is meant to be active at a time (the other two stay commented out):

    1. Check if some columns are missing
    2. Check if some columns have None values
    3. Check if some columns have -9999.9 (missing) values

    Scenario 3 is the one currently enabled.

    Args:
        df (pd.DataFrame): raw data; it is modified in place.

    Returns:
        pd.DataFrame: the same ``df`` object, with every column listed in
        ``cols`` created or overwritten with -9999.9.
    """
    # Smaller column subsets that can be swapped in for a narrower test:
    # cols = ['ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z']
    # cols = ['Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6']
    # cols = ['Polar_bpm', 'Polar_hrv', 'tlx_score']
    cols = [
        # features
        "tlx_score",
        "E4_BVP",
        "E4_GSR",
        "LooxidLink_EEG_A3",
        "LooxidLink_EEG_A4",
        "LooxidLink_EEG_FP1",
        "LooxidLink_EEG_FP2",
        "LooxidLink_EEG_A7",
        "LooxidLink_EEG_A8",
        "Muse_EEG_TP9",
        "Muse_EEG_AF7",
        "Muse_EEG_AF8",
        "Muse_EEG_TP10",
        "Muse_PPG_0",
        "Muse_PPG_1",
        "Muse_PPG_2",
        "Myo_GYR_X",
        "Myo_GYR_Y",
        "Myo_GYR_Z",
        "Myo_EMG_0",
        "Myo_EMG_1",
        "Myo_EMG_2",
        "Myo_EMG_3",
        "Myo_EMG_4",
        "Myo_EMG_5",
        "Myo_EMG_6",
        "Myo_EMG_7",
        "PICARD_fnirs_0",
        "PICARD_fnirs_1",
        "Polar_bpm",
        "Polar_hrv",
        "ViveEye_pupilPos_L_X",
        "ViveEye_pupilPos_L_Y",
        "ViveEye_pupilPos_R_X",
        "ViveEye_pupilPos_R_Y",
        "ViveEye_gazeOrigin_L_X",
        "ViveEye_gazeOrigin_L_Y",
        "ViveEye_gazeOrigin_L_Z",
        "ViveEye_gazeOrigin_R_X",
        "ViveEye_gazeOrigin_R_Y",
        "ViveEye_gazeOrigin_R_Z",
        "ViveEye_gazeDirection_L_X",
        "ViveEye_gazeDirection_L_Y",
        "ViveEye_gazeDirection_L_Z",
        "ViveEye_gazeDirection_R_X",
        "ViveEye_gazeDirection_R_Y",
        "ViveEye_gazeDirection_R_Z",
        "ViveEye_eyeOpenness_L",
        "ViveEye_pupilDiameter_L",
        "ViveEye_eyeOpenness_R",
        "ViveEye_pupilDiameter_R",
        "Zephyr_HR",
        "Zephyr_HRV",
    ]

    # case 1 - no column
    # df = df.drop(cols, axis=1)

    # case 2 - None values
    # for col in cols:
    #     df[col] = None

    # case 3 - missing values
    # Column-by-column assignment creates the column when it is absent and
    # overwrites it when it is present.
    for col in cols:
        df[col] = -9999.9
    return df
5589
5690
57- def read_and_prepare_data_chunk (df ):
91+ def read_and_prepare_data_chunk (df : pd .DataFrame ) -> pd .DataFrame :
92+ """
93+ Read raw data and prepare it for processing.
94+ I.e., create columns if they are missing,
95+ replace missing values with None,
96+ etc.
97+
98+ Args:
99+ df (pd.DataFrame): input raw data
100+
101+ Returns:
102+ pd.DataFrame: processed data
103+ """
58104 EXPECTED_COLUMNS = [
59105 # features
60- 'tlx_score' , 'E4_BVP' , 'E4_GSR' , 'LooxidLink_EEG_A3' , 'LooxidLink_EEG_A4' ,
61- 'LooxidLink_EEG_FP1' , 'LooxidLink_EEG_FP2' , 'LooxidLink_EEG_A7' , 'LooxidLink_EEG_A8' ,
62-
63- 'Muse_EEG_TP9' , 'Muse_EEG_AF7' , 'Muse_EEG_AF8' , 'Muse_EEG_TP10' ,
64- 'Muse_PPG_0' , 'Muse_PPG_1' , 'Muse_PPG_2' ,
65-
66- 'Myo_GYR_X' , 'Myo_GYR_Y' , 'Myo_GYR_Z' ,
67- 'Myo_EMG_0' , 'Myo_EMG_1' , 'Myo_EMG_2' , 'Myo_EMG_3' , 'Myo_EMG_4' , 'Myo_EMG_5' , 'Myo_EMG_6' , 'Myo_EMG_7' ,
68-
69- 'PICARD_fnirs_0' , 'PICARD_fnirs_1' ,
70-
71- 'Polar_bpm' , 'Polar_hrv' ,
72-
73- 'ViveEye_pupilPos_L_X' , 'ViveEye_pupilPos_L_Y' ,
74- 'ViveEye_pupilPos_R_X' , 'ViveEye_pupilPos_R_Y' ,
75-
76- 'ViveEye_gazeOrigin_L_X' , 'ViveEye_gazeOrigin_L_Y' , 'ViveEye_gazeOrigin_L_Z' ,
77- 'ViveEye_gazeOrigin_R_X' , 'ViveEye_gazeOrigin_R_Y' , 'ViveEye_gazeOrigin_R_Z' ,
78- 'ViveEye_gazeDirection_L_X' , 'ViveEye_gazeDirection_L_Y' , 'ViveEye_gazeDirection_L_Z' ,
79- 'ViveEye_gazeDirection_R_X' , 'ViveEye_gazeDirection_R_Y' , 'ViveEye_gazeDirection_R_Z' ,
80-
81- 'ViveEye_eyeOpenness_L' , 'ViveEye_pupilDiameter_L' ,
82- 'ViveEye_eyeOpenness_R' , 'ViveEye_pupilDiameter_R' ,
83-
84- 'Zephyr_HR' , 'Zephyr_HRV' ,
85-
106+ "tlx_score" ,
107+ "E4_BVP" ,
108+ "E4_GSR" ,
109+ "LooxidLink_EEG_A3" ,
110+ "LooxidLink_EEG_A4" ,
111+ "LooxidLink_EEG_FP1" ,
112+ "LooxidLink_EEG_FP2" ,
113+ "LooxidLink_EEG_A7" ,
114+ "LooxidLink_EEG_A8" ,
115+ "Muse_EEG_TP9" ,
116+ "Muse_EEG_AF7" ,
117+ "Muse_EEG_AF8" ,
118+ "Muse_EEG_TP10" ,
119+ "Muse_PPG_0" ,
120+ "Muse_PPG_1" ,
121+ "Muse_PPG_2" ,
122+ "Myo_GYR_X" ,
123+ "Myo_GYR_Y" ,
124+ "Myo_GYR_Z" ,
125+ "Myo_EMG_0" ,
126+ "Myo_EMG_1" ,
127+ "Myo_EMG_2" ,
128+ "Myo_EMG_3" ,
129+ "Myo_EMG_4" ,
130+ "Myo_EMG_5" ,
131+ "Myo_EMG_6" ,
132+ "Myo_EMG_7" ,
133+ "PICARD_fnirs_0" ,
134+ "PICARD_fnirs_1" ,
135+ "Polar_bpm" ,
136+ "Polar_hrv" ,
137+ "ViveEye_pupilPos_L_X" ,
138+ "ViveEye_pupilPos_L_Y" ,
139+ "ViveEye_pupilPos_R_X" ,
140+ "ViveEye_pupilPos_R_Y" ,
141+ "ViveEye_gazeOrigin_L_X" ,
142+ "ViveEye_gazeOrigin_L_Y" ,
143+ "ViveEye_gazeOrigin_L_Z" ,
144+ "ViveEye_gazeOrigin_R_X" ,
145+ "ViveEye_gazeOrigin_R_Y" ,
146+ "ViveEye_gazeOrigin_R_Z" ,
147+ "ViveEye_gazeDirection_L_X" ,
148+ "ViveEye_gazeDirection_L_Y" ,
149+ "ViveEye_gazeDirection_L_Z" ,
150+ "ViveEye_gazeDirection_R_X" ,
151+ "ViveEye_gazeDirection_R_Y" ,
152+ "ViveEye_gazeDirection_R_Z" ,
153+ "ViveEye_eyeOpenness_L" ,
154+ "ViveEye_pupilDiameter_L" ,
155+ "ViveEye_eyeOpenness_R" ,
156+ "ViveEye_pupilDiameter_R" ,
157+ "Zephyr_HR" ,
158+ "Zephyr_HRV" ,
86159 # target
87- "induced_state"
160+ "induced_state" ,
88161 ]
89162
163+ # uncomment to enable test
90164 # df = _test_missing_features(df)
91165
92166 # test_suite
93- if ' test_suite' not in df .columns :
94- df [' test_suite' ] = "test"
167+ if " test_suite" not in df .columns :
168+ df [" test_suite" ] = "test"
95169
96170 df ["time" ] = pd .to_datetime (df ["time" ], unit = "us" )
97171 df ["timestamp" ] = df ["time" ].dt .round ("1s" )
@@ -118,30 +192,48 @@ def read_and_prepare_data_chunk(df):
118192 return ags
119193
120194
def get_dummy_template(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the template needed to match the expected sample submission format.

    The template holds one row per unique combination of the metadata
    columns and the one-second timestamp; the first occurrence is kept.

    Only the key columns are copied, so the caller's frame is left
    untouched and the (potentially very wide) feature columns are never
    duplicated.

    Args:
        df (pd.DataFrame): data chunk containing ``METADATA_COLUMNS`` and a
            ``time`` column that ``pd.to_datetime(..., unit="us")`` accepts.

    Returns:
        pd.DataFrame: de-duplicated frame with columns
        ``METADATA_COLUMNS + ["timestamp"]`` and a fresh 0..n-1 index.
    """
    key_columns = METADATA_COLUMNS + ["timestamp"]

    # Work on the metadata columns only; .copy() keeps the new column from
    # being written into (a view of) the caller's frame.
    template = df[METADATA_COLUMNS].copy()

    # Round each sample time to the nearest second to form the timestamp key.
    template["timestamp"] = pd.to_datetime(df["time"], unit="us").dt.round("1s")

    template = template.drop_duplicates(subset=key_columns, keep="first")
    return template.reset_index(drop=True)
131207
132208
def get_needed_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Read data for training/testing and prepare template format for submission.

    Both helpers are given the same input frame; the prepared data is built
    first, then the submission template.

    Args:
        df (pd.DataFrame): one chunk of raw data.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]:
            res1 - output of ``read_and_prepare_data_chunk`` (read data)
            res2 - output of ``get_dummy_template`` (template for submission)
    """
    res1 = read_and_prepare_data_chunk(df)
    res2 = get_dummy_template(df)
    return res1, res2
137220
138221
139222def read_data (
140- path_to_data : str ,
141- debug : bool = False
142- ) -> pd .DataFrame :
223+ path_to_data : str , debug : bool = False
224+ ) -> Tuple [pd .DataFrame , pd .DataFrame ]:
225+ """
226+ Read data. The data is read in chunks to reduce memory consumption.
227+
228+ Args:
229+ path_to_data (str): path to data
230+ debug (bool, optional): run data loading on a sample of data. Defaults to False.
231+
232+ Returns:
233+ Tuple[pd.DataFrame, pd.DataFrame]: Read data and prepared template for submission
234+ """
143235 t_start = time .time ()
144- chunksize = 10 ** 6
236+ chunksize = 10 ** 6
145237
146238 if path_to_data is None :
147239 path_to_data = "./data/training-data.zip"
@@ -152,11 +244,9 @@ def read_data(
152244 else :
153245 nrows = None
154246
247+ # create chunks iterator to read data
155248 chunks = pd .read_csv (
156- path_to_data ,
157- na_values = NAN_VALUES ,
158- chunksize = chunksize ,
159- nrows = nrows
249+ path_to_data , na_values = NAN_VALUES , chunksize = chunksize , nrows = nrows
160250 )
161251
162252 # get data for processing
@@ -167,12 +257,14 @@ def read_data(
167257 res = [i [0 ] for i in full_result ]
168258 res = pd .concat (res , axis = 0 )
169259 res = res .sort_index ()
170- res = res [~ res .index .duplicated (keep = ' first' )]
260+ res = res [~ res .index .duplicated (keep = " first" )]
171261
172262 # collect dummies for sub
173263 res2 = [i [1 ] for i in full_result ]
174264 res2 = pd .concat (res2 , axis = 0 )
175- res2 = res2 .drop_duplicates (subset = METADATA_COLUMNS + ["timestamp" ], keep = "first" ).reset_index (drop = True )
265+ res2 = res2 .drop_duplicates (
266+ subset = METADATA_COLUMNS + ["timestamp" ], keep = "first"
267+ ).reset_index (drop = True )
176268 t_end = time .time ()
177269 print (f"Data is read. Time per reading: { (t_end - t_start )/ 60 :.2f} minutes" )
178270 return res , res2
0 commit comments