Skip to content

Commit 2477cf1

Browse files
committed
Daily summary
1 parent a3e59a7 commit 2477cf1

File tree

3 files changed

+332
-142
lines changed

3 files changed

+332
-142
lines changed

src/actinet/actinet.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from actinet import __classifier_md5__
1919
from actinet.accPlot import plotTimeSeries
2020
from actinet.models import ActivityClassifier
21-
from actinet.summarisation import getActivitySummary
21+
from actinet.summarisation import get_activity_summary
22+
from actinet.utils.summary_utils import calculate_daily_wear_stats
2223
from actinet.utils.utils import infer_freq, drop_first_last_days, flag_wear_below_days, calculate_wear_stats
2324

2425
BASE_URL = "https://wearables-files.ndph.ox.ac.uk/files/models/actinet/"
@@ -173,7 +174,7 @@ def main():
173174
args.csv_start_row-1, # -1 to convert to zero-based index
174175
args.csv_date_format,
175176
args.calibration_stdtol_min,
176-
resample_hz=None,
177+
resample_hz="uniform",
177178
sample_rate=args.sample_rate,
178179
verbose=verbose,
179180
)
@@ -190,6 +191,8 @@ def main():
190191
# Update wear time stats after exclusions
191192
info.update(calculate_wear_stats(data))
192193

194+
daily_wear_stats = calculate_daily_wear_stats(data)
195+
193196
# Output paths
194197
basename = resolve_path(args.filepath)[1]
195198
outdir = os.path.join(args.outdir, basename)
@@ -262,7 +265,8 @@ def main():
262265
print("Output plot written to:", plotFile)
263266

264267
# Summary
265-
summary = getActivitySummary(Y, list(classifier.labels), True, True, verbose)
268+
summary, daily_summary = get_activity_summary(Y, list(classifier.labels),
269+
True, True, verbose)
266270

267271
# Join the actipy processing info with the activity summary data
268272
outputSummary = {**summary, **info}
@@ -274,6 +278,11 @@ def main():
274278
if verbose:
275279
print("Output summary written to:", outputSummaryFile)
276280

281+
daily_summary = pd.concat([daily_wear_stats, daily_summary], axis=1)
282+
283+
daily_summary.insert(0, 'Filename', info['Filename']) # add filename for reference
284+
daily_summary.to_csv(f"{outdir}/{basename}-Daily.csv.gz")
285+
277286
# Print
278287
if verbose:
279288
print("\nSummary Stats\n---------------------")

src/actinet/summarisation.py

Lines changed: 48 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
from pandas.tseries.frequencies import to_offset
66
import scipy.stats as stats
77

8-
from actinet.utils.utils import date_parser, toScreen
8+
from actinet.utils.utils import date_parser, to_screen
9+
from actinet.utils.summary_utils import *
910
from actinet import circadian
1011

1112

12-
def getActivitySummary(
13+
def get_activity_summary(
1314
data,
1415
labels,
1516
intensityDistribution=False,
@@ -22,19 +23,19 @@ def getActivitySummary(
2223
1) calculate imputation values to replace nan PA metric values
2324
2) calculate empirical cumulative distribution function of vector magnitudes
2425
3) derive main movement summaries (overall, weekday/weekend, and hour)
26+
4) derive daily summaries (daily enmo and daily activity)
2527
2628
:param str data: Input csv.gz file or pandas dataframe of processed epoch data
2729
:param list(str) labels: Activity state labels
2830
:param bool intensityDistribution: Add intensity outputs to dict <summary>
2931
:param bool circadianMetrics: Add circadian rhythm metrics to dict <summary>
3032
:param bool verbose: Print verbose output
3133
32-
:return: A summary of the activity.
33-
:rtype: dict
34+
:return: A summary of the activity, and a per-day summary (daily enmo and daily activity).
35+
:rtype: tuple(dict, pd.DataFrame)
3436
3537
"""
36-
37-
toScreen("=== Summarizing ===", verbose)
38+
to_screen("=== Summarizing ===", verbose)
3839

3940
if isinstance(data, str):
4041
data = pd.read_csv(
@@ -44,21 +45,42 @@ def getActivitySummary(
4445
date_parser=date_parser,
4546
)
4647

48+
# Impute missing values
49+
data_imputed = _impute_missing(data, labels, verbose)
50+
4751
# Main movement summaries
4852
summary = _summarise(
4953
data,
54+
data_imputed,
5055
labels,
5156
intensityDistribution,
5257
circadianMetrics,
5358
verbose,
5459
)
5560

56-
# Return physical activity summary
57-
return summary
61+
# Daily summaries
62+
daily_summary = _daily_summary(data, data_imputed, labels, verbose)
63+
64+
# Return physical activity summaries
65+
return summary, daily_summary
66+
67+
68+
def _impute_missing(data, labels, verbose=False):
69+
# In the following, we resample, pad and impute the data so that we have a
70+
# multiple of 24h for the stats calculations
71+
to_screen("=== Imputing missing values ===", verbose)
72+
73+
cols = ["acc"] + labels
74+
if "MET" in data.columns:
75+
cols.append("MET")
76+
data_imputed = impute_missing(data[cols].astype("float"))
77+
78+
return data_imputed
5879

5980

6081
def _summarise(
6182
data,
83+
data_imputed,
6284
labels,
6385
intensityDistribution=False,
6486
circadianMetrics=False,
@@ -68,6 +90,7 @@ def _summarise(
6890
"""Overall summary stats for each activity type to summary dict
6991
7092
:param pandas.DataFrame data: Pandas dataframe of epoch data
93+
:param pandas.DataFrame data_imputed: Pandas dataframe of epoch data with imputed missing values
7194
:param list(str) labels: Activity state labels
7295
:param dict summary: Output dictionary containing all summary metrics
7396
:param bool intensityDistribution: Add intensity outputs to dict <summary>
@@ -103,22 +126,17 @@ def _summarise(
103126
if intensityDistribution:
104127
summary = calculateECDF(data["acc"], summary)
105128

106-
# In the following, we resample, pad and impute the data so that we have a
107-
# multiple of 24h for the stats calculations
108129
tStart, tEnd = data.index[0], data.index[-1]
109-
cols = ["acc"] + labels
110-
if "MET" in data.columns:
111-
cols.append("MET")
112-
data = imputeMissing(data[cols].astype("float"))
113130

114131
# Overall stats (no padding, i.e. only within recording period)
115-
toScreen("=== Calculating overall statistics ===", verbose)
116-
overallStats = data[tStart:tEnd].apply(["mean", "std"])
132+
to_screen("=== Calculating overall statistics ===", verbose)
133+
overallStats = data_imputed[tStart:tEnd].apply(["mean", "std"])
117134
for col in overallStats:
118135
summary[f"{col}-overall-avg"] = overallStats[col].loc["mean"]
119136
summary[f"{col}-overall-sd"] = overallStats[col].loc["std"]
120137

121-
dayOfWeekStats = data.groupby([data.index.weekday, data.index.hour]).mean()
138+
dayOfWeekStats = data_imputed.groupby([data_imputed.index.weekday,
139+
data_imputed.index.hour]).mean()
122140
dayOfWeekStats.index = dayOfWeekStats.index.set_levels(
123141
dayOfWeekStats.index.levels[0]
124142
.to_series()
@@ -170,133 +188,24 @@ def _summarise(
170188

171189
# Calculate circadian metrics
172190
if circadianMetrics:
173-
toScreen("=== Calculating circadian metrics ===", verbose)
174-
summary = circadian.calculatePSD(data, epochPeriod, False, labels, summary)
175-
summary = circadian.calculatePSD(data, epochPeriod, True, labels, summary)
191+
to_screen("=== Calculating circadian metrics ===", verbose)
192+
summary = circadian.calculatePSD(data_imputed, epochPeriod, False, labels, summary)
193+
summary = circadian.calculatePSD(data_imputed, epochPeriod, True, labels, summary)
176194
summary = circadian.calculateFourierFreq(
177-
data, epochPeriod, False, labels, summary
195+
data_imputed, epochPeriod, False, labels, summary
178196
)
179197
summary = circadian.calculateFourierFreq(
180-
data, epochPeriod, False, labels, summary
198+
data_imputed, epochPeriod, True, labels, summary
181199
)
182-
summary = circadian.calculateM10L5(data, epochPeriod, summary)
200+
summary = circadian.calculateM10L5(data_imputed, epochPeriod, summary)
183201

184202
return summary
185203

186204

187-
def imputeMissing(data, extrapolate=True):
188-
"""Impute missing/nonwear segments
189-
190-
Impute non-wear data segments using the average of similar time-of-day values
191-
with one minute granularity on different days of the measurement. This
192-
imputation accounts for potential wear time diurnal bias where, for example,
193-
if the device was systematically less worn during sleep in an individual,
194-
the crude average vector magnitude during wear time would be a biased
195-
overestimate of the true average. See
196-
https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0169649#sec013
197-
198-
:param pandas.DataFrame e: Pandas dataframe of epoch data
199-
:param bool verbose: Print verbose output
200-
201-
:return: Update DataFrame <e> columns nan values with time-of-day imputation
202-
:rtype: void
203-
"""
204-
205-
if extrapolate:
206-
# padding at the boundaries to have full 24h
207-
data = data.reindex(
208-
pd.date_range(
209-
data.index[0].floor("D"),
210-
data.index[-1].ceil("D"),
211-
freq=to_offset(pd.infer_freq(data.index)),
212-
inclusive="left",
213-
name="time",
214-
),
215-
method="nearest",
216-
tolerance=pd.Timedelta("1m"),
217-
limit=1,
218-
)
219-
220-
def fillna(subframe):
221-
# Transform will first pass the subframe column-by-column as a Series.
222-
# After passing all columns, it will pass the entire subframe again as a DataFrame.
223-
# Processing the entire subframe is optional (return value can be omitted). See 'Notes' in transform doc.
224-
if isinstance(subframe, pd.Series):
225-
x = subframe.to_numpy()
226-
nan = np.isnan(x)
227-
nanlen = len(x[nan])
228-
if 0 < nanlen < len(x): # check x contains a NaN and is not all NaN
229-
x[nan] = np.nanmean(x)
230-
return x # will be cast back to a Series automatically
231-
else:
232-
return subframe
233-
234-
data = (
235-
data
236-
# first attempt imputation using same day of week
237-
.groupby([data.index.weekday, data.index.hour, data.index.minute])
238-
.transform(fillna)
239-
# then try within weekday/weekend
240-
.groupby([data.index.weekday >= 5, data.index.hour, data.index.minute])
241-
.transform(fillna)
242-
# finally, use all other days
243-
.groupby([data.index.hour, data.index.minute])
244-
.transform(fillna)
245-
)
246-
247-
return data
248-
249-
250-
def infer_freq(x):
251-
"""Like pd.infer_freq but more forgiving"""
252-
freq, _ = stats.mode(np.diff(x), keepdims=False)
253-
freq = pd.Timedelta(freq)
254-
return freq
255-
256-
257-
def calculateECDF(x, summary):
258-
"""Calculate activity intensity empirical cumulative distribution
259-
260-
The input data must not be imputed, as ECDF requires different imputation
261-
where nan/non-wear data segments are IMPUTED FOR EACH INTENSITY LEVEL. Here,
262-
the average of similar time-of-day values is imputed with one minute
263-
granularity on different days of the measurement. Following intensity levels
264-
are calculated:
265-
1mg bins from 1-20mg
266-
5mg bins from 25-100mg
267-
25mg bins from 125-500mg
268-
100mg bins from 500-2000mg
269-
270-
:param pandas.DataFrame e: Pandas dataframe of epoch data
271-
:param str inputCol: Column to calculate intensity distribution on
272-
:param dict summary: Output dictionary containing all summary metrics
273-
274-
:return: Updated summary file
275-
:rtype: dict
276-
"""
277-
278-
levels = np.concatenate(
279-
[
280-
np.linspace(1, 20, 20), # 1mg bins from 1-20mg
281-
np.linspace(25, 100, 16), # 5mg bins from 25-100mg
282-
np.linspace(125, 500, 16), # 25mg bins from 125-500mg
283-
np.linspace(600, 2000, 15), # 100mg bins from 500-2000mg
284-
]
285-
).astype("int")
286-
287-
whrnan = x.isna().to_numpy()
288-
ecdf = x.to_numpy().reshape(-1, 1) <= levels.reshape(1, -1)
289-
ecdf[whrnan] = np.nan
290-
291-
ecdf = (
292-
pd.DataFrame(ecdf, index=x.index, columns=levels)
293-
.groupby([x.index.hour, x.index.minute])
294-
.mean() # first average is across same time of day
295-
.mean() # second average is within each level
296-
)
297-
298-
# Write to summary
299-
for level, val in ecdf.items():
300-
summary[f"{x.name}-ecdf-{level}mg"] = val
301-
302-
return summary
205+
def _daily_summary(data, data_imputed, labels, verbose=False):
206+
to_screen("=== Daily summary ===", verbose)
207+
daily_enmo = summarize_daily_enmo(data["acc"], data_imputed["acc"])
208+
daily_activity = summarize_daily_activity(data[labels], data_imputed[labels], labels)
209+
daily_summary = pd.concat([daily_enmo, daily_activity], axis=1)
210+
daily_summary.index.name = "Date"
211+
return daily_summary

0 commit comments

Comments
 (0)