-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbellabeat_cleanup.R
More file actions
228 lines (212 loc) · 11.9 KB
/
bellabeat_cleanup.R
File metadata and controls
228 lines (212 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# Install packages
# Only install what is not already available, so re-running the script does
# not download and rebuild every dependency each time.
cran_packages <- c("tidyverse", "skimr", "janitor", "knitr", "plyr", "devtools")
is_installed <- vapply(
  cran_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(cran_packages[!is_installed])
}
# flipTime is installed from its GitHub repository via devtools. The call is
# namespace-qualified so devtools does not have to be attached.
if (!requireNamespace("flipTime", quietly = TRUE)) {
  devtools::install_github("Displayr/flipTime")
}
#Import libraries
library(tidyverse)
library(skimr)
library(janitor)
library(knitr)
library(plyr)
library(lubridate)
library(flipTime)
# Load CSVs
# Every raw export is read from the same directory; it is named once here so
# the script can be pointed at another location by editing a single line.
source_dir <- "/Users/tyesondemets/Desktop/Git/Health-App-Usage-Analytics/Resources/Source"

# Read one raw export from `source_dir` given its file name.
read_source <- function(file_name) {
  read_csv(file.path(source_dir, file_name))
}

daily_activity <- read_source("dailyActivity_merged.csv")
daily_calories <- read_source("dailyCalories_merged.csv")
# The intensities and steps tables each read their own file (these two file
# names were previously swapped between the two variables).
daily_intensities <- read_source("dailyIntensities_merged.csv")
daily_steps <- read_source("dailySteps_merged.csv")
heartrate_seconds <- read_source("heartrate_seconds_merged.csv")
hourly_calories <- read_source("hourlyCalories_merged.csv")
hourly_intensities <- read_source("hourlyIntensities_merged.csv")
hourly_steps <- read_source("hourlySteps_merged.csv")
minute_calories_narrow <- read_source("minuteCaloriesNarrow_merged.csv")
minute_calories_wide <- read_source("minuteCaloriesWide_merged.csv")
minute_intensities_narrow <- read_source("minuteIntensitiesNarrow_merged.csv")
minute_intensities_wide <- read_source("minuteIntensitiesWide_merged.csv")
minute_mets_narrow <- read_source("minuteMETsNarrow_merged.csv")
minute_sleep <- read_source("minuteSleep_merged.csv")
minute_steps_narrow <- read_source("minuteStepsNarrow_merged.csv")
minute_steps_wide <- read_source("minuteStepsWide_merged.csv")
sleep_day <- read_source("sleepDay_merged.csv")
weight_log_info <- read_source("weightLogInfo_merged.csv")
# Check head
# Preview the first rows of each loaded table.
daily_activity %>% head()
daily_calories %>% head()
daily_intensities %>% head()
daily_steps %>% head()
heartrate_seconds %>% head()
hourly_calories %>% head()
hourly_intensities %>% head()
hourly_steps %>% head()
minute_calories_narrow %>% head()
minute_calories_wide %>% head()
minute_intensities_narrow %>% head()
minute_intensities_wide %>% head()
minute_mets_narrow %>% head()
minute_sleep %>% head()
minute_steps_narrow %>% head()
minute_steps_wide %>% head()
sleep_day %>% head()
weight_log_info %>% head()
# Glimpse
# Print the column types and leading values of every loaded table. The three
# daily_* tables below daily_activity are included so this section covers the
# same 18 tables as the head and skim sections.
glimpse(daily_activity)
glimpse(daily_calories)
glimpse(daily_intensities)
glimpse(daily_steps)
glimpse(heartrate_seconds)
glimpse(hourly_calories)
glimpse(hourly_intensities)
glimpse(hourly_steps)
glimpse(minute_calories_narrow)
glimpse(minute_calories_wide)
glimpse(minute_intensities_narrow)
glimpse(minute_intensities_wide)
glimpse(minute_mets_narrow)
glimpse(minute_sleep)
glimpse(minute_steps_narrow)
glimpse(minute_steps_wide)
glimpse(sleep_day)
glimpse(weight_log_info)
# Skim without charts
# Summary statistics for each table. The tables are passed by name (not piped)
# so the printed summary is labelled with the table's own name.
skimr::skim_without_charts(daily_activity)
skimr::skim_without_charts(daily_calories)
skimr::skim_without_charts(daily_intensities)
skimr::skim_without_charts(daily_steps)
skimr::skim_without_charts(heartrate_seconds)
skimr::skim_without_charts(hourly_calories)
skimr::skim_without_charts(hourly_intensities)
skimr::skim_without_charts(hourly_steps)
skimr::skim_without_charts(minute_calories_narrow)
skimr::skim_without_charts(minute_calories_wide)
skimr::skim_without_charts(minute_intensities_narrow)
skimr::skim_without_charts(minute_intensities_wide)
skimr::skim_without_charts(minute_mets_narrow)
skimr::skim_without_charts(minute_sleep)
skimr::skim_without_charts(minute_steps_narrow)
skimr::skim_without_charts(minute_steps_wide)
skimr::skim_without_charts(sleep_day)
skimr::skim_without_charts(weight_log_info)
# Convert time column to datetime - Reference - https://www.displayr.com/r-date-conversion/
# Return `tbl` with the column named `column` replaced by the result of
# running AsDateTime() on it; all other columns are left as they are.
convert_to_datetime <- function(tbl, column) {
  tbl[[column]] <- AsDateTime(tbl[[column]])
  tbl
}

# Daily tables
daily_activity <- convert_to_datetime(daily_activity, "ActivityDate")
daily_calories <- convert_to_datetime(daily_calories, "ActivityDay")
daily_intensities <- convert_to_datetime(daily_intensities, "ActivityDay")
daily_steps <- convert_to_datetime(daily_steps, "ActivityDay")
# Per-second and hourly tables
heartrate_seconds <- convert_to_datetime(heartrate_seconds, "Time")
hourly_calories <- convert_to_datetime(hourly_calories, "ActivityHour")
hourly_intensities <- convert_to_datetime(hourly_intensities, "ActivityHour")
hourly_steps <- convert_to_datetime(hourly_steps, "ActivityHour")
# Minute tables (narrow tables use ActivityMinute, wide tables ActivityHour)
minute_calories_narrow <- convert_to_datetime(minute_calories_narrow, "ActivityMinute")
minute_calories_wide <- convert_to_datetime(minute_calories_wide, "ActivityHour")
minute_intensities_narrow <- convert_to_datetime(minute_intensities_narrow, "ActivityMinute")
minute_intensities_wide <- convert_to_datetime(minute_intensities_wide, "ActivityHour")
minute_mets_narrow <- convert_to_datetime(minute_mets_narrow, "ActivityMinute")
minute_sleep <- convert_to_datetime(minute_sleep, "date")
minute_steps_narrow <- convert_to_datetime(minute_steps_narrow, "ActivityMinute")
minute_steps_wide <- convert_to_datetime(minute_steps_wide, "ActivityHour")
# Sleep and weight logs
sleep_day <- convert_to_datetime(sleep_day, "SleepDay")
weight_log_info <- convert_to_datetime(weight_log_info, "Date")
# Check column names
# List the column names of each table.
names(daily_activity)
names(daily_calories)
names(daily_intensities)
names(daily_steps)
names(heartrate_seconds)
names(hourly_calories)
names(hourly_intensities)
names(hourly_steps)
names(minute_calories_narrow)
names(minute_calories_wide)
names(minute_intensities_narrow)
names(minute_intensities_wide)
names(minute_mets_narrow)
names(minute_sleep)
names(minute_steps_narrow)
names(minute_steps_wide)
names(sleep_day)
names(weight_log_info)
# Check nulls - Output = None, except weight_log_info with 65 nulls
# Count the NA cells across all columns of each table.
daily_activity %>% is.na() %>% sum()
daily_calories %>% is.na() %>% sum()
daily_intensities %>% is.na() %>% sum()
daily_steps %>% is.na() %>% sum()
heartrate_seconds %>% is.na() %>% sum()
hourly_calories %>% is.na() %>% sum()
hourly_intensities %>% is.na() %>% sum()
hourly_steps %>% is.na() %>% sum()
minute_calories_narrow %>% is.na() %>% sum()
minute_calories_wide %>% is.na() %>% sum()
minute_intensities_narrow %>% is.na() %>% sum()
minute_intensities_wide %>% is.na() %>% sum()
minute_mets_narrow %>% is.na() %>% sum()
minute_sleep %>% is.na() %>% sum()
minute_steps_narrow %>% is.na() %>% sum()
minute_steps_wide %>% is.na() %>% sum()
sleep_day %>% is.na() %>% sum()
weight_log_info %>% is.na() %>% sum() # 65 nulls
# Check duplicates - Output = no duplicates, except for those noted below.
# Count the rows of each table that repeat an earlier row exactly.
daily_activity %>% duplicated() %>% sum()
daily_calories %>% duplicated() %>% sum()
daily_intensities %>% duplicated() %>% sum()
daily_steps %>% duplicated() %>% sum()
heartrate_seconds %>% duplicated() %>% sum()
hourly_calories %>% duplicated() %>% sum()
hourly_intensities %>% duplicated() %>% sum()
hourly_steps %>% duplicated() %>% sum()
minute_calories_narrow %>% duplicated() %>% sum()
minute_calories_wide %>% duplicated() %>% sum()
minute_intensities_narrow %>% duplicated() %>% sum()
minute_intensities_wide %>% duplicated() %>% sum()
minute_mets_narrow %>% duplicated() %>% sum()
minute_sleep %>% duplicated() %>% sum() # 543 duplicates
minute_steps_narrow %>% duplicated() %>% sum()
minute_steps_wide %>% duplicated() %>% sum()
sleep_day %>% duplicated() %>% sum() # 3 duplicates
weight_log_info %>% duplicated() %>% sum()
# Identify duplicate rows
# Keep a copy of the repeated rows for reference before they are removed.
minute_sleep_duplicates <- minute_sleep[duplicated(minute_sleep), ]
sleep_day_duplicates <- sleep_day[duplicated(sleep_day), ]
# Drop duplicate rows
# Only the first occurrence of each distinct row is kept.
minute_sleep <- distinct(minute_sleep)
sleep_day <- distinct(sleep_day)
# Check for outliers - No extreme outliers observed
# Draw boxplots for every column of `tbl` except those named in `exclude`
# (identifier and timestamp columns). The column mask is derived from the same
# table that is subset, so the two can never refer to different tables.
boxplot_measures <- function(tbl, exclude) {
  boxplot(tbl[, !names(tbl) %in% exclude])
}

# Each exclusion list names the table's id column(s) plus the timestamp column
# that the datetime-conversion step of this script converts for that table.
boxplot_measures(daily_activity, c("Id", "ActivityDate"))
boxplot_measures(daily_calories, c("Id", "ActivityDay"))
boxplot_measures(daily_intensities, c("Id", "ActivityDay"))
boxplot_measures(daily_steps, c("Id", "ActivityDay"))
boxplot_measures(heartrate_seconds, c("Id", "Time"))
boxplot_measures(hourly_calories, c("Id", "ActivityHour"))
boxplot_measures(hourly_intensities, c("Id", "ActivityHour"))
boxplot_measures(hourly_steps, c("Id", "ActivityHour"))
boxplot_measures(minute_calories_narrow, c("Id", "ActivityMinute"))
boxplot_measures(minute_calories_wide, c("Id", "ActivityHour"))
boxplot_measures(minute_intensities_narrow, c("Id", "ActivityMinute"))
boxplot_measures(minute_intensities_wide, c("Id", "ActivityHour"))
boxplot_measures(minute_mets_narrow, c("Id", "ActivityMinute"))
boxplot_measures(minute_sleep, c("Id", "date", "logId"))
boxplot_measures(minute_steps_narrow, c("Id", "ActivityMinute"))
boxplot_measures(minute_steps_wide, c("Id", "ActivityHour"))
boxplot_measures(sleep_day, c("Id", "SleepDay"))
boxplot_measures(weight_log_info, c("Id", "Date", "IsManualReport", "LogId"))
# Check max outlier of hourly_calories and location
max_hourly_calories <- max(hourly_calories$Calories)
max_hourly_calories
# Locate the row(s) holding that maximum by exact comparison against the
# computed value, rather than by searching for a hard-coded number as text
# (which would also match any value that merely contains those digits).
which(hourly_calories$Calories == max_hourly_calories)
# Write to csv to load in BigQuery database
# All cleaned tables go to one directory; it is named once here and created if
# it does not exist yet so the first write does not fail on a fresh checkout.
cleaned_dir <- "/Users/tyesondemets/Desktop/Git/Health-App-Usage-Analytics/Resources/Cleaned"
dir.create(cleaned_dir, recursive = TRUE, showWarnings = FALSE)

# Write `tbl` as a CSV named `file_name` inside `cleaned_dir`.
write_cleaned <- function(tbl, file_name) {
  write_csv(tbl, file = file.path(cleaned_dir, file_name))
}

write_cleaned(daily_activity, "dailyActivity_cleaned.csv")
write_cleaned(daily_calories, "dailyCalories_cleaned.csv")
write_cleaned(daily_intensities, "dailyIntensities_cleaned.csv")
write_cleaned(daily_steps, "dailySteps_cleaned.csv")
write_cleaned(heartrate_seconds, "heartrateSeconds_cleaned.csv")
write_cleaned(hourly_calories, "hourlyCalories_cleaned.csv")
write_cleaned(hourly_intensities, "hourlyIntensities_cleaned.csv")
write_cleaned(hourly_steps, "hourlySteps_cleaned.csv")
write_cleaned(minute_calories_narrow, "minuteCaloriesNarrow_cleaned.csv")
write_cleaned(minute_calories_wide, "minuteCaloriesWide_cleaned.csv")
write_cleaned(minute_intensities_narrow, "minuteIntensitiesNarrow_cleaned.csv")
write_cleaned(minute_intensities_wide, "minuteIntensitiesWide_cleaned.csv")
write_cleaned(minute_mets_narrow, "minuteMETsNarrow_cleaned.csv")
write_cleaned(minute_sleep, "minuteSleep_cleaned.csv")
write_cleaned(minute_steps_narrow, "minuteStepsNarrow_cleaned.csv")
write_cleaned(minute_steps_wide, "minuteStepsWide_cleaned.csv")
write_cleaned(sleep_day, "sleepDay_cleaned.csv")
write_cleaned(weight_log_info, "weightLogInfo_cleaned.csv")