Skip to content

Commit a4997f8

Browse files
authored
Merge pull request #6 from Ben-Sacks/main
Update dAUC with additional lag of 1
2 parents 3ccb597 + 6dde7a8 commit a4997f8

File tree

8 files changed

+193
-95
lines changed

8 files changed

+193
-95
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Suggests:
4141
rmarkdown,
4242
testthat (>= 3.0.0)
4343
URL: https://github.com/Reilly-ConceptsCognitionLab/ConversationAlign
44-
RoxygenNote: 7.3.2
44+
RoxygenNote: 7.3.3
4545
LazyData: true
4646
VignetteBuilder: knitr
4747
Collate:

NAMESPACE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ importFrom(dplyr,everything)
1414
importFrom(dplyr,filter)
1515
importFrom(dplyr,first)
1616
importFrom(dplyr,group_by)
17+
importFrom(dplyr,lag)
1718
importFrom(dplyr,left_join)
1819
importFrom(dplyr,matches)
1920
importFrom(dplyr,mutate)
@@ -32,7 +33,6 @@ importFrom(purrr,map_dfr)
3233
importFrom(rlang,":=")
3334
importFrom(rlang,sym)
3435
importFrom(stats,cor.test)
35-
importFrom(stats,lag)
3636
importFrom(stats,na.omit)
3737
importFrom(stats,sd)
3838
importFrom(stringi,stri_replace_all_fixed)

R/compute_auc.R

Lines changed: 170 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,77 @@
11
#' compute_auc
22
#'
33
#' internal function that computes two indices of global alignment (auc) between conversation partners for each dyad
4-
#' @name compute_spearman
4+
#' @name compute_auc
55
#' @importFrom dplyr bind_rows
6+
#' @importFrom dplyr bind_cols
7+
#' @importFrom dplyr left_join
68
#' @importFrom dplyr mutate
9+
#' @importFrom dplyr lag
710
#' @importFrom dplyr select
11+
#' @importFrom dplyr group_by
12+
#' @importFrom dplyr ungroup
13+
#' @importFrom dplyr summarize
14+
#' @importFrom dplyr first
15+
#' @importFrom dplyr rename_with
16+
#' @importFrom dplyr across
17+
#' @importFrom dplyr everything
18+
#' @importFrom tidyselect contains
19+
#' @importFrom tidyr pivot_wider
20+
#' @importFrom tidyr fill
821
#' @importFrom magrittr %>%
22+
#' @importFrom zoo na.approx
23+
#' @importFrom DescTools AUC
924
#' @returns
1025
#' nothing - internal function used for intermediary computation piped into summarize_dyads function
1126
#' @keywords internal
1227
#' @noRd
1328

1429
compute_auc <- function(df_prep, verbose = TRUE) {
30+
# additional_lags can be added as a parameter, and will accept a vector of ints
31+
additional_lags = c(1)
32+
# define the internal functions here:
33+
# function to calculate AUC for single time series, return NA if there is an error
34+
calculate_auc <- function(domain_ts, doc_name, dimension, verbose) {
35+
tryCatch({
36+
# if time series has fewer points than the threshold, fill with NA
37+
if (max(domain_ts$Exchange_Count) < 3) { # hard coded to three exchanges
38+
#create a single row, single column dataframe with one empty value to fill in the AUC
39+
doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
40+
Exchanges = max(domain_ts$Exchange_Count))
41+
}
42+
else {
43+
domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame
44+
# na.rm is enabled, which will drop any NA values, so any differences where one of the original scores=NA
45+
domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2],
46+
method = "trapezoid", na.rm = T)
47+
doc_domain_auc_df <- data.frame(domain_auc,
48+
Exchanges = max(domain_ts$Exchange_Count)) #make data frame of AUC, replicated once
49+
}
50+
doc_domain_auc_df
51+
},
52+
error = function(e) {
53+
if (verbose) {
54+
message(paste("Results for dAUC will be filled with NA.\n\tTranscript:",
55+
doc_name, "\n\tDimension:", dimension))
56+
}
57+
# fill the result cell with NA
58+
doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
59+
Exchanges = max(domain_ts$Exchange_Count))
60+
doc_domain_auc_df
61+
})
62+
} # end of ccalculate_auc()
63+
1564
# selects align_var by grepping on possible prefixes of dimensions
1665
align_var <- grep("^(emo_|lex_|sem_|phon_)", colnames(df_prep), value = TRUE, ignore.case = TRUE)
1766

1867
# split the data frame into a list by event id
1968
df_list <- split(df_prep, f = df_prep$Event_ID)
2069

70+
# create a list of the first speaker in each dyad (S1) - for appending at end
71+
s1_col <- df_prep %>%
72+
dplyr::group_by(Event_ID) %>%
73+
dplyr::summarize(S1 = unique(Participant_ID)[1])
74+
2175
# iterate over each event, replacing participant names with a transient filler variable
2276
df_list_speakvar <- lapply(df_list, function(df){
2377
svec <- unique(as.character(df$Participant_ID))
@@ -76,101 +130,137 @@ compute_auc <- function(df_prep, verbose = TRUE) {
76130
# join the two participant data frame together by dyad and Exchange_Count
77131
widedf <- dplyr::left_join(split_pid_df_list[[1]], split_pid_df_list[[2]], by = c("Event_ID", "Participant_Pair", "Exchange_Count"))
78132

79-
#iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values
80-
for (dimension in align_var){
81-
both_participant_cols <- widedf %>%
82-
dplyr::select(starts_with(dimension))
83-
# add a count to the data frame (x-axis)
84-
85-
absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2]))
86-
widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol
133+
# create a list of wide data frames
134+
wide_df_list <- list(widedf)
135+
# get the length of the wide df list before adding lags - for naming columns
136+
start_wide_df_length <- length(wide_df_list)
137+
# if given, lag time series in widedf and add to list
138+
if (length(additional_lags > 0)) {
139+
# iteration for additional
140+
for (lag in additional_lags){
141+
lagdf <- widedf
142+
val <- abs(lag)
143+
# if positive lag all S1, if negative, lag all S2
144+
if (lag > 0) {
145+
lagdf <- widedf %>%
146+
dplyr::group_by(Event_ID) %>%
147+
dplyr::mutate(dplyr::across(tidyselect::contains("S1"), ~dplyr::lag(., n = val))) %>%
148+
dplyr::ungroup()
149+
}
150+
else if (lag < 0) {
151+
lagdf <- widedf %>%
152+
dplyr::group_by(Event_ID) %>%
153+
dplyr::mutate(dplyr::across(tidyselect::contains("S2"), ~dplyr::lag(., n = val))) %>%
154+
dplyr::ungroup()
155+
}
156+
wide_df_list[[length(wide_df_list)+1]] <- lagdf
157+
}
87158
}
88159

89-
long_diff_df <- widedf %>%
90-
tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")),
91-
names_to = c("dimension", "Participant_ID"),
92-
names_pattern = "(.*)_([^_]+)$",
93-
values_to = "score") %>% # pivot longer by dimension and pid
94-
# pivot each dimension to a column
95-
tidyr::pivot_wider(names_from = dimension, values_from = score) %>%
96-
dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn
97-
dplyr::select(-Participant_ID) # remove pid column
98-
# split the difference data frame into a list based on event id
99-
long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID)
100-
101-
# grab the aligned dimensions as a vector to iterate over
102-
xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_var)]
103-
104-
domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion
105-
# now iterate over each dyad in the corpus
106-
107-
# function to calculate AUC for single time series, return NA if there is an error
108-
calculate_auc <- function(domain_ts, doc_name, dimension) {
109-
tryCatch({
110-
# if time series has fewer points than the threshold, fill with NA
111-
if (max(domain_ts$Exchange_Count) < 3) { # hard coded to three exchanges
112-
#create a single row, single column dataframe with one empty value to fill in the AUC
113-
doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
114-
Exchanges = max(domain_ts$Exchange_Count))
115-
}
116-
else {
117-
domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame
118-
domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2], method = "trapezoid")
119-
doc_domain_auc_df <- data.frame(domain_auc,
120-
Exchanges = max(domain_ts$Exchange_Count)) #make data frame of AUC, replicated once
121-
}
122-
doc_domain_auc_df
123-
},
124-
error = function(e) {
125-
if (verbose) {
126-
message(paste("Results for dAUC will be filled with NA.\n\tTranscript:",
127-
doc_name, "\n\tDimension:", dimension))
128-
}
129-
# fill the result cell with NA
130-
doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
131-
Exchanges = max(domain_ts$Exchange_Count))
132-
doc_domain_auc_df
133-
})
160+
# iterate over the full computation of dAUC for each lag
161+
list_of_each_dauc_df <- lapply(seq_along(wide_df_list), function(i) {
162+
widedf <- wide_df_list[[i]]
163+
164+
#iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values
165+
for (dimension in align_var){
166+
both_participant_cols <- widedf %>%
167+
dplyr::select(starts_with(dimension))
168+
absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2]))
169+
widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol
134170
}
135171

136-
# iterate over each document
137-
single_doc_auc <- lapply(long_diff_df_list, function(df_prep){
172+
long_diff_df <- widedf %>%
173+
tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")),
174+
names_to = c("dimension", "Participant_ID"),
175+
names_pattern = "(.*)_([^_]+)$",
176+
values_to = "score") %>% # pivot longer by dimension and pid
177+
# pivot each dimension to a column
178+
tidyr::pivot_wider(names_from = dimension, values_from = score) %>%
179+
dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn
180+
dplyr::select(-Participant_ID) # remove pid column
181+
# split the difference data frame into a list based on event id
182+
long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID)
183+
184+
# grab the aligned dimensions as a vector to iterate over
185+
xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_var)]
138186

139-
domain_ts <- df_prep %>%
140-
dplyr::select(Exchange_Count,
141-
tidyselect::contains(dimension)) # take dimension and time
187+
# NOTE: calculate_auc function is defined here
142188

143-
# put the function in here
144-
single_doc_domain_auc <- calculate_auc(domain_ts,
145-
doc_name = as.character(df_prep$Event_ID)[1],
146-
dimension = dimension)
189+
domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion
190+
# iterate over each document
191+
single_doc_auc <- lapply(long_diff_df_list, function(df_prep){
192+
193+
domain_ts <- df_prep %>%
194+
dplyr::select(Exchange_Count,
195+
tidyselect::contains(dimension)) # take dimension and time
196+
197+
# put the function in here
198+
single_doc_domain_auc <- calculate_auc(domain_ts,
199+
doc_name = as.character(df_prep$Event_ID)[1],
200+
dimension = dimension,
201+
verbose = verbose)
202+
})
203+
#bind all docs AUCs for emotion into one column and add column prefix
204+
all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc)
205+
# drop exchange column for all but first dimension
206+
if (dimension != xdimensions[1]) {
207+
all_doc_domain_auc_df <- data.frame(all_doc_domain_auc_df[,1])
208+
}
209+
colnames(all_doc_domain_auc_df)[1] <- paste("AUC", dimension, sep = "_")
210+
all_doc_domain_auc_df
147211
})
148-
#bind all docs AUCs for emotion into one column and add column prefix
149-
all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc)
150-
# drop exchange column for all but first dimension
151-
if (dimension != xdimensions[1]) {
152-
all_doc_domain_auc_df <- data.frame(all_doc_domain_auc_df[,1])
212+
213+
all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame
214+
# standardize each AUC to 50
215+
all_domain_df_s <- all_domain_df %>%
216+
dplyr::mutate(dplyr::across(tidyselect::contains("AUC"), ~ (50 * .x) / Exchanges)) %>%
217+
dplyr::select(-Exchanges)
218+
219+
output_auc <- dplyr::left_join(all_domain_df, all_domain_df_s,
220+
by = "Event_ID", suffix = c("_raw", "_scaled100")) %>%
221+
dplyr::select(c(Event_ID, Exchanges, dplyr::everything()))
222+
223+
224+
# add lag to columns if there is a lag
225+
if (i > start_wide_df_length) {
226+
# get lag from index
227+
lag_index <- i - start_wide_df_length
228+
output_auc <- output_auc %>%
229+
dplyr::rename_with(~ paste0(., "_Lag",
230+
as.character(additional_lags[lag_index])),
231+
tidyselect::starts_with("AUC"))
232+
}
233+
# if not the first iteration, remove event id and exchanges
234+
if (i > 1) {
235+
output_auc <- output_auc %>%
236+
dplyr::select(-c(Event_ID, Exchanges))
153237
}
154-
colnames(all_doc_domain_auc_df)[1] <- paste("AUC", dimension, sep = "_")
155-
all_doc_domain_auc_df
238+
output_auc
239+
})
240+
241+
# combine the data frames from different lagged time series
242+
combined_lag_df <- dplyr::bind_cols(list_of_each_dauc_df) %>%
243+
dplyr::mutate(Talked_First = s1_col$S1, .after = Exchanges)
244+
# sub out negative lag colnames for lead
245+
colnames(combined_lag_df) <- gsub("_Lag-", "_Lead", colnames(combined_lag_df))
246+
# set the names of zero lag columsn to immediate
247+
immediate_ind <- grep(".*(raw|scaled100)$", colnames(combined_lag_df), ignore.case = F)
248+
#print(immediate_ind)
249+
colnames(combined_lag_df)[immediate_ind] <- sapply(colnames(combined_lag_df)[immediate_ind], function(x){
250+
paste0(x, "_Immediate")
156251
})
157252

158-
all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame
253+
#colnames(combined_lag_df)[immediate_ind] <- new_names
254+
159255
# throw warning if any dyads are fewer than 50 exchanges
160-
small_dyads <- all_domain_df[which(all_domain_df$Exchanges < 50), "Event_ID"]
256+
small_dyads <- combined_lag_df[which(combined_lag_df$Exchanges < 50), "Event_ID"]
257+
small_dyads <- unique(small_dyads)
161258
if (length(small_dyads) > 0) {
162259
warning(paste0("Some conversations are shorter than 50 exchanges (100 turns). ",
163260
"It is recommended that conversations are longer than 50 exchanges. ",
164261
"Affected conversations:\n",
165262
paste(small_dyads, collapse = ", ")))
166263
}
167-
# standardize each AUC to 50
168-
all_domain_df_s <- all_domain_df %>%
169-
dplyr::mutate(dplyr::across(tidyselect::contains("AUC"), ~ (50 * .x) / Exchanges)) %>%
170-
dplyr::select(-Exchanges)
171-
172-
output_auc <- dplyr::left_join(all_domain_df, all_domain_df_s, by = "Event_ID", suffix = c("_raw", "_scaled100")) %>%
173-
dplyr::select(c(Event_ID, Exchanges, dplyr::everything()))
174264

175-
return(output_auc)
265+
return(combined_lag_df)
176266
}

R/compute_lagcorr.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ compute_lagcorr <- function(df_prep, lags = c(-2, 0, 2), corr_type = "Pearson")
4141

4242
# Iterate over each split data frame
4343
output_df_list <- lapply(df_list, function(df) {
44-
# Establish participant names and S1/S2 keys
44+
# Establish participant names and S1/S2 keys according to who speaks first
4545
participantvec <- unique(df$Participant_ID)
4646
names(participantvec) <- c("S1", "S2")
4747
df$Participant_ID <- gsub(participantvec[1], names(participantvec)[1], df$Participant_ID)

R/prep_dyads.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
#' @importFrom dplyr na_if
2222
#' @importFrom dplyr ungroup
2323
#' @importFrom magrittr %>%
24-
#' @importFrom stats lag
2524
#' @importFrom stringi stri_replace_all_fixed
2625
#' @importFrom stringi stri_replace_all_regex
2726
#' @importFrom stringr str_trim

R/summarize_dyads.R

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
#' @export summarize_dyads
4242

4343
summarize_dyads <- function(df_prep, custom_lags = NULL, sumdat_only = TRUE, corr_type = 'Pearson') {
44+
# prevent note - no visible binding
45+
Lag <- Talked_First <- NULL
4446
# Validate correlation type at the start
4547
if (!corr_type %in% c("Pearson", "Spearman")) {
4648
stop("corr_type must be either 'Pearson' or 'Spearman'")
@@ -116,20 +118,21 @@ summarize_dyads <- function(df_prep, custom_lags = NULL, sumdat_only = TRUE, cor
116118
auc_df_long <- auc_df %>%
117119
tidyr::pivot_longer(
118120
contains("AUC"),
119-
names_to = c("Dimension", "reshaped"),
120-
names_pattern = "AUC_(.*)_(raw|scaled100)",
121+
names_to = c("Dimension", "reshaped", "Lag"),
122+
names_pattern = "AUC_(.*)_(raw|scaled100)_(Immediate|Lag\\d|Lead\\d)",
121123
values_to = "AUC"
122124
) %>%
123-
tidyr::pivot_wider(
124-
names_from = reshaped,
125+
tidyr::pivot_wider( # pivot out the reshaped and lag columns
126+
names_from = c(reshaped, Lag),
125127
names_prefix = "AUC_",
126128
values_from = AUC
127129
)
128130

129131
# Combine all data frames - with conditional join
130132
df_summarize <- av_df %>%
131133
dplyr::left_join(auc_df_long, by = c("Event_ID", "Dimension")) %>%
132-
dplyr::left_join(covar_df, by = c("Event_ID", "Dimension"))
134+
dplyr::left_join(covar_df, by = c("Event_ID", "Dimension")) %>%
135+
dplyr::relocate(Talked_First, .after = Participant_ID) # move Talked_First to after Participant ID
133136

134137
# Only join metadata if there were columns to summarize
135138
if (length(meta_cols) > 0) {

0 commit comments

Comments
 (0)