55from pandas .tseries .frequencies import to_offset
66import scipy .stats as stats
77
8- from actinet .utils .utils import date_parser , toScreen
8+ from actinet .utils .utils import date_parser , to_screen
9+ from actinet .utils .summary_utils import *
910from actinet import circadian
1011
1112
12- def getActivitySummary (
13+ def get_activity_summary (
1314 data ,
1415 labels ,
1516 intensityDistribution = False ,
@@ -22,19 +23,19 @@ def getActivitySummary(
2223 1) calculate imputation values to replace nan PA metric values
2324 2) calculate empirical cumulative distribution function of vector magnitudes
2425 3) derive main movement summaries (overall, weekday/weekend, and hour)
26+ 4) derive daily summaries (daily enmo and daily activity)
2527
2628 :param str data: Input csv.gz file or pandas dataframe of processed epoch data
2729 :param list(str) labels: Activity state labels
2830 :param bool intensityDistribution: Add intensity outputs to dict <summary>
2931 :param bool circadianMetrics: Add circadian rhythm metrics to dict <summary>
3032 :param bool verbose: Print verbose output
3133
32- :return: A summary of the activity.
33- :rtype: dict
34+ :return: A summary of the activity and daily wear statistics.
35+ :rtype: tuple(dict, pd.DataFrame)
3436
3537 """
36-
37- toScreen ("=== Summarizing ===" , verbose )
38+ to_screen ("=== Summarizing ===" , verbose )
3839
3940 if isinstance (data , str ):
4041 data = pd .read_csv (
@@ -44,21 +45,42 @@ def getActivitySummary(
4445 date_parser = date_parser ,
4546 )
4647
48+ # Impute missing values
49+ data_imputed = _impute_missing (data , labels , verbose )
50+
4751 # Main movement summaries
4852 summary = _summarise (
4953 data ,
54+ data_imputed ,
5055 labels ,
5156 intensityDistribution ,
5257 circadianMetrics ,
5358 verbose ,
5459 )
5560
56- # Return physical activity summary
57- return summary
61+ # Daily summaries
62+ daily_summary = _daily_summary (data , data_imputed , labels , verbose )
63+
64+ # Return physical activity summaries
65+ return summary , daily_summary
66+
67+
68+ def _impute_missing (data , labels , verbose = False ):
69+ # In the following, we resample, pad and impute the data so that we have a
70+ # multiple of 24h for the stats calculations
71+ to_screen ("=== Imputing missing values ===" , verbose )
72+
73+ cols = ["acc" ] + labels
74+ if "MET" in data .columns :
75+ cols .append ("MET" )
76+ data_imputed = impute_missing (data [cols ].astype ("float" ))
77+
78+ return data_imputed
5879
5980
6081def _summarise (
6182 data ,
83+ data_imputed ,
6284 labels ,
6385 intensityDistribution = False ,
6486 circadianMetrics = False ,
@@ -68,6 +90,7 @@ def _summarise(
6890 """Overall summary stats for each activity type to summary dict
6991
7092 :param pandas.DataFrame data: Pandas dataframe of epoch data
93+ :param pandas.DataFrame data_imputed: Pandas dataframe of epoch data with imputed missing values
7194 :param list(str) labels: Activity state labels
7295 :param dict summary: Output dictionary containing all summary metrics
7396 :param bool intensityDistribution: Add intensity outputs to dict <summary>
@@ -103,22 +126,17 @@ def _summarise(
103126 if intensityDistribution :
104127 summary = calculateECDF (data ["acc" ], summary )
105128
106- # In the following, we resample, pad and impute the data so that we have a
107- # multiple of 24h for the stats calculations
108129 tStart , tEnd = data .index [0 ], data .index [- 1 ]
109- cols = ["acc" ] + labels
110- if "MET" in data .columns :
111- cols .append ("MET" )
112- data = imputeMissing (data [cols ].astype ("float" ))
113130
114131 # Overall stats (no padding, i.e. only within recording period)
115- toScreen ("=== Calculating overall statistics ===" , verbose )
116- overallStats = data [tStart :tEnd ].apply (["mean" , "std" ])
132+ to_screen ("=== Calculating overall statistics ===" , verbose )
133+ overallStats = data_imputed [tStart :tEnd ].apply (["mean" , "std" ])
117134 for col in overallStats :
118135 summary [f"{ col } -overall-avg" ] = overallStats [col ].loc ["mean" ]
119136 summary [f"{ col } -overall-sd" ] = overallStats [col ].loc ["std" ]
120137
121- dayOfWeekStats = data .groupby ([data .index .weekday , data .index .hour ]).mean ()
138+ dayOfWeekStats = data_imputed .groupby ([data_imputed .index .weekday ,
139+ data_imputed .index .hour ]).mean ()
122140 dayOfWeekStats .index = dayOfWeekStats .index .set_levels (
123141 dayOfWeekStats .index .levels [0 ]
124142 .to_series ()
@@ -170,133 +188,24 @@ def _summarise(
170188
171189 # Calculate circadian metrics
172190 if circadianMetrics :
173- toScreen ("=== Calculating circadian metrics ===" , verbose )
174- summary = circadian .calculatePSD (data , epochPeriod , False , labels , summary )
175- summary = circadian .calculatePSD (data , epochPeriod , True , labels , summary )
191+ to_screen ("=== Calculating circadian metrics ===" , verbose )
192+ summary = circadian .calculatePSD (data_imputed , epochPeriod , False , labels , summary )
193+ summary = circadian .calculatePSD (data_imputed , epochPeriod , True , labels , summary )
176194 summary = circadian .calculateFourierFreq (
177- data , epochPeriod , False , labels , summary
195+ data_imputed , epochPeriod , False , labels , summary
178196 )
179197 summary = circadian .calculateFourierFreq (
180- data , epochPeriod , False , labels , summary
198+ data_imputed , epochPeriod , True , labels , summary
181199 )
182- summary = circadian .calculateM10L5 (data , epochPeriod , summary )
200+ summary = circadian .calculateM10L5 (data_imputed , epochPeriod , summary )
183201
184202 return summary
185203
186204
187- def imputeMissing (data , extrapolate = True ):
188- """Impute missing/nonwear segments
189-
190- Impute non-wear data segments using the average of similar time-of-day values
191- with one minute granularity on different days of the measurement. This
192- imputation accounts for potential wear time diurnal bias where, for example,
193- if the device was systematically less worn during sleep in an individual,
194- the crude average vector magnitude during wear time would be a biased
195- overestimate of the true average. See
196- https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0169649#sec013
197-
198- :param pandas.DataFrame e: Pandas dataframe of epoch data
199- :param bool verbose: Print verbose output
200-
201- :return: Update DataFrame <e> columns nan values with time-of-day imputation
202- :rtype: void
203- """
204-
205- if extrapolate :
206- # padding at the boundaries to have full 24h
207- data = data .reindex (
208- pd .date_range (
209- data .index [0 ].floor ("D" ),
210- data .index [- 1 ].ceil ("D" ),
211- freq = to_offset (pd .infer_freq (data .index )),
212- inclusive = "left" ,
213- name = "time" ,
214- ),
215- method = "nearest" ,
216- tolerance = pd .Timedelta ("1m" ),
217- limit = 1 ,
218- )
219-
220- def fillna (subframe ):
221- # Transform will first pass the subframe column-by-column as a Series.
222- # After passing all columns, it will pass the entire subframe again as a DataFrame.
223- # Processing the entire subframe is optional (return value can be omitted). See 'Notes' in transform doc.
224- if isinstance (subframe , pd .Series ):
225- x = subframe .to_numpy ()
226- nan = np .isnan (x )
227- nanlen = len (x [nan ])
228- if 0 < nanlen < len (x ): # check x contains a NaN and is not all NaN
229- x [nan ] = np .nanmean (x )
230- return x # will be cast back to a Series automatically
231- else :
232- return subframe
233-
234- data = (
235- data
236- # first attempt imputation using same day of week
237- .groupby ([data .index .weekday , data .index .hour , data .index .minute ])
238- .transform (fillna )
239- # then try within weekday/weekend
240- .groupby ([data .index .weekday >= 5 , data .index .hour , data .index .minute ])
241- .transform (fillna )
242- # finally, use all other days
243- .groupby ([data .index .hour , data .index .minute ])
244- .transform (fillna )
245- )
246-
247- return data
248-
249-
250- def infer_freq (x ):
251- """Like pd.infer_freq but more forgiving"""
252- freq , _ = stats .mode (np .diff (x ), keepdims = False )
253- freq = pd .Timedelta (freq )
254- return freq
255-
256-
257- def calculateECDF (x , summary ):
258- """Calculate activity intensity empirical cumulative distribution
259-
260- The input data must not be imputed, as ECDF requires different imputation
261- where nan/non-wear data segments are IMPUTED FOR EACH INTENSITY LEVEL. Here,
262- the average of similar time-of-day values is imputed with one minute
263- granularity on different days of the measurement. Following intensity levels
264- are calculated:
265- 1mg bins from 1-20mg
266- 5mg bins from 25-100mg
267- 25mg bins from 125-500mg
268- 100mg bins from 500-2000mg
269-
270- :param pandas.DataFrame e: Pandas dataframe of epoch data
271- :param str inputCol: Column to calculate intensity distribution on
272- :param dict summary: Output dictionary containing all summary metrics
273-
274- :return: Updated summary file
275- :rtype: dict
276- """
277-
278- levels = np .concatenate (
279- [
280- np .linspace (1 , 20 , 20 ), # 1mg bins from 1-20mg
281- np .linspace (25 , 100 , 16 ), # 5mg bins from 25-100mg
282- np .linspace (125 , 500 , 16 ), # 25mg bins from 125-500mg
283- np .linspace (600 , 2000 , 15 ), # 100mg bins from 500-2000mg
284- ]
285- ).astype ("int" )
286-
287- whrnan = x .isna ().to_numpy ()
288- ecdf = x .to_numpy ().reshape (- 1 , 1 ) <= levels .reshape (1 , - 1 )
289- ecdf [whrnan ] = np .nan
290-
291- ecdf = (
292- pd .DataFrame (ecdf , index = x .index , columns = levels )
293- .groupby ([x .index .hour , x .index .minute ])
294- .mean () # first average is across same time of day
295- .mean () # second average is within each level
296- )
297-
298- # Write to summary
299- for level , val in ecdf .items ():
300- summary [f"{ x .name } -ecdf-{ level } mg" ] = val
301-
302- return summary
205+ def _daily_summary (data , data_imputed , labels , verbose = False ):
206+ to_screen ("=== Daily summary ===" , verbose )
207+ daily_enmo = summarize_daily_enmo (data ["acc" ], data_imputed ["acc" ])
208+ daily_activity = summarize_daily_activity (data [labels ], data_imputed [labels ], labels )
209+ daily_summary = pd .concat ([daily_enmo , daily_activity ], axis = 1 )
210+ daily_summary .index .name = "Date"
211+ return daily_summary
0 commit comments