-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
80 lines (57 loc) · 2.97 KB
/
preprocessing.py
File metadata and controls
80 lines (57 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import pandas as pd
from utils.result import Result
def get_activities_data(directory: str) -> Result[pd.DataFrame]:
df = pd.read_csv(f'{directory}/macro.csv')
# Filter only the activities TAKEOFF and EXPLORE
df = df[df.activity.isin(['TAKEOFF', 'EXPLORE'])]
# Drop the payload column
df = df.drop(['payload'], axis=1)
return Result(success=True, data=df)
def get_position_data(directory: str) -> Result[pd.DataFrame]:
df = pd.read_csv(f'{directory}/odom.csv')
return Result(success=True, data=df)
def build_artefacts():
directories = [os.path.join("data", f) for f in os.listdir('data') if os.path.isdir(os.path.join("data", f))]
for directory in directories:
df_activity = get_activities_data(directory).data
df_position = get_position_data(directory).data
df_activity['time'] = pd.to_datetime(df_activity['time'])
df_position['time'] = pd.to_datetime(df_position['time'])
# Sort both DataFrames by the datetime column
df_activity = df_activity.sort_values(by='time')
df_position = df_position.sort_values(by='time')
df_position = join_positions_and_activities(df_activity, df_position)
# Save the enriched dataset to a new CSV file
file_name = f'dataset_{directory.split("/")[-1]}'
df_position.to_csv(f'artefacts/{file_name}.csv', index=False)
def join_positions_and_activities(df_activity: pd.DataFrame, df_position: pd.DataFrame) -> pd.DataFrame:
# Initialize a column in odom_df to hold the activity information
df_position['activity'] = None
# Loop through macro_df to find activity time ranges
for i in range(len(df_activity) - 1):
row = df_activity.iloc[i]
next_row = df_activity.iloc[i + 1]
# Check if this row marks the start and the next row marks the end of an activity
if row['lifecycle'] == 'START' and next_row['lifecycle'] == 'STOP' and row['activity'] == next_row['activity']:
start_time = row['time']
end_time = next_row['time']
activity = row['activity']
# Assign activity only to rows within the start and end time interval in odom_df
df_position.loc[(df_position['time'] >= start_time) & (df_position['time'] <= end_time), 'activity'] = activity
# Set "IDLE" as the default activity for rows without any assigned activity
df_position['activity'] = df_position['activity'].fillna('IDLE')
return df_position
def join_artefacts():
frames = []
artefacts = [os.path.join("artefacts", f) for f in os.listdir('artefacts') if f.startswith('dataset_')]
# Loop through the artefacts
for frame in artefacts:
# Load the dataset
df_partial = pd.read_csv(frame)
# Append the dataset to the main DataFrame
frames.append(df_partial)
df = pd.concat(frames)
df = df.sort_values(by='time')
# Save the joined dataset to a new CSV file
df.to_csv('artefacts/training_dataset.csv', index=False)